gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
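/* As a stand-alone illustration of the transformation sketched above
   (a sketch only: it uses the GNU vector_size extension instead of the
   internal V8HI mode spelling, assumes N is a multiple of 8, and ignores
   the alignment and aliasing issues the real pass has to handle):

     typedef short v8hi __attribute__ ((vector_size (16)));   8 x short

     void
     vadd (short *a, short *b, short *c, int N)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];        one vector add per 8 scalar adds
     }
*/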
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
169 gimple *stmt = stmt_info->stmt;
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
187 if (stmt_vectype)
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
251 return opt_result::success ();
254 /* Function vect_determine_vectorization_factor
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
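/* A minimal sketch with a concrete factor, assuming VF = 4 and reusing the
   a[i:VF] pseudo-notation from above; the leftover iterations are shown as
   a scalar epilogue here, although the real pass may instead peel,
   version, or use partial vectors:

     int i;
     for (i = 0; i + 4 <= N; i += 4)
       a[i:4] = b[i:4] + c[i:4];        vector body, 4 elements at a time
     for (; i < N; i++)
       a[i] = b[i] + c[i];              scalar epilogue for N % 4 elements
*/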
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294 for (i = 0; i < nbbs; i++)
296 basic_block bb = bbs[i];
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
307 gcc_assert (stmt_info);
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
339 vect_update_max_nunits (&vectorization_factor, vectype);
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
357 /* TODO: Analyze cost. Decide if worth while to vectorize. */
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
373 /* Function vect_is_simple_iv_evolution.
375 FORNOW: A simple evolution of an induction variable in the loop is
376 considered a polynomial evolution. */
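/* Roughly, in the usual chrec notation {init, +, step}_loopnum, and
   assuming the simple nest

     for (i = 0; i < n; i++)
       s = s + i;

   i has the evolution {0, +, 1}_1, which is "simple" (init 0, step 1),
   while s has the second-degree evolution {s_0, +, {0, +, 1}_1}_1, whose
   evolution part is itself a chrec and is therefore rejected below. */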
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
404 *init = init_expr;
405 *step = step_expr;
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
423 return true;
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
437 x_3 = ...;
440 outer2:
441 x_4 = PHI <x_3(inner)>;
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
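/* One source form that typically produces this PHI structure, assuming
   sum is only read again after the whole nest (a classic double
   reduction):

     sum = 0;
     for (i = 0; i < n; i++)          outer loop: x_1 / x_4
       for (j = 0; j < m; j++)        inner loop: x_2 / x_3
         sum = sum + a[i][j];
*/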
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
459 /* Function vect_analyze_scalar_cycles_1.
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
477 /* First - identify all inductions. Reduction detection assumes that all the
478 inductions have been identified, therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
490 /* Skip virtual phi's. The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
517 worklist.safe_push (stmt_vinfo);
518 continue;
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
560 else
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
570 else
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
579 loop-aware SLP if it was not detected as reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
595 /* Function vect_analyze_scalar_cycles.
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
600 We do that for the loop represented by LOOP_VINFO, and also for its
601 inner-loop, if it exists.
602 Examples for scalar cycles:
604 Example1: reduction:
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
610 Example2: induction:
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
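/* A further illustrative sketch, assuming the outer loop is the one being
   vectorized: an inner-loop cycle that is private to each outer iteration,

     for (i = 0; i < n; i++)
       {
         x = a[i];
         for (j = 0; j < m; j++)
           x = x + b[j];
         c[i] = x;
       }

   is classified by vect_analyze_scalar_cycles_1 above as a nested cycle
   (vect_nested_cycle) rather than as a reduction of the loop being
   vectorized. */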
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
624 Reductions in such inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
658 while (stmt_info);
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
666 stmt_vec_info first;
667 unsigned i;
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
680 /* If all reduction chain members are well-formed patterns adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
685 if (STMT_VINFO_IN_PATTERN_P (first))
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
692 /* If not all stmt in the chain are patterns or if we failed
693 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
694 it as regular reduction instead. */
695 else
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
716 /* Function vect_get_loop_niters.
718 Determine how many iterations the loop is executed and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
723 Return the loop exit condition. */
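/* A small worked example, assuming the source loop

     for (i = 0; i < n; i++)
       ...

   with n known to be positive: the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS, the number of
   header executions, is n. */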
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
740 if (!exit)
741 return cond;
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
755 if (may_be_zero)
757 if (COMPARISON_CLASS_P (may_be_zero))
759 /* Try to combine may_be_zero with assumptions, this can simplify
760 computation of niter expression. */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
772 may_be_zero = NULL_TREE;
774 else if (integer_nonzerop (may_be_zero))
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
780 else
781 return cond;
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
787 /* We want the number of loop header executions which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
796 return cond;
799 /* Function bb_in_loop_p
801 Used as predicate for dfs order traversal of the loop bbs. */
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, init_cost (loop_in, false), shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 th (0),
825 versioning_threshold (0),
826 vectorization_factor (0),
827 main_loop_edge (nullptr),
828 skip_main_loop_edge (nullptr),
829 skip_this_loop_edge (nullptr),
830 reusable_accumulators (),
831 max_vectorization_factor (0),
832 mask_skip_niters (NULL_TREE),
833 rgroup_compare_type (NULL_TREE),
834 simd_if_cond (NULL_TREE),
835 unaligned_dr (NULL),
836 peeling_for_alignment (0),
837 ptr_mask (0),
838 ivexpr_map (NULL),
839 scan_map (NULL),
840 slp_unrolling_factor (1),
841 single_scalar_iteration_cost (0),
842 vec_outside_cost (0),
843 vec_inside_cost (0),
844 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
845 vectorizable (false),
846 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
847 using_partial_vectors_p (false),
848 epil_using_partial_vectors_p (false),
849 peeling_for_gaps (false),
850 peeling_for_niter (false),
851 no_data_dependencies (false),
852 has_mask_store (false),
853 scalar_loop_scaling (profile_probability::uninitialized ()),
854 scalar_loop (NULL),
855 orig_loop_info (NULL)
857 /* CHECKME: We want to visit all BBs before their successors (except for
858 latch blocks, for which this assertion wouldn't hold). In the simple
859 case of the loop forms we allow, a dfs order of the BBs would be the same
860 as reversed postorder traversal, so we are safe. */
862 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
863 bbs, loop->num_nodes, loop);
864 gcc_assert (nbbs == loop->num_nodes);
866 for (unsigned int i = 0; i < nbbs; i++)
868 basic_block bb = bbs[i];
869 gimple_stmt_iterator si;
871 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
873 gimple *phi = gsi_stmt (si);
874 gimple_set_uid (phi, 0);
875 add_stmt (phi);
878 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
880 gimple *stmt = gsi_stmt (si);
881 gimple_set_uid (stmt, 0);
882 if (is_gimple_debug (stmt))
883 continue;
884 add_stmt (stmt);
885 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
886 third argument is the #pragma omp simd if (x) condition, when 0,
887 loop shouldn't be vectorized, when non-zero constant, it should
888 be vectorized normally, otherwise versioned with vectorized loop
889 done if the condition is non-zero at runtime. */
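	   /* For instance, for a source loop such as

		#pragma omp simd if (x)
		for (int i = 0; i < n; i++)
		  a[i] = b[i];

	      a form of the "if (x)" condition is what appears as that
	      third argument (an illustrative sketch, not an exhaustive
	      description of how the front ends lower the pragma).  */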
890 if (loop_in->simduid
891 && is_gimple_call (stmt)
892 && gimple_call_internal_p (stmt)
893 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
894 && gimple_call_num_args (stmt) >= 3
895 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
896 && (loop_in->simduid
897 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
899 tree arg = gimple_call_arg (stmt, 2);
900 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
901 simd_if_cond = arg;
902 else
903 gcc_assert (integer_nonzerop (arg));
908 epilogue_vinfos.create (6);
911 /* Free all levels of rgroup CONTROLS. */
913 void
914 release_vec_loop_controls (vec<rgroup_controls> *controls)
916 rgroup_controls *rgc;
917 unsigned int i;
918 FOR_EACH_VEC_ELT (*controls, i, rgc)
919 rgc->controls.release ();
920 controls->release ();
923 /* Free all memory used by the _loop_vec_info, as well as all the
924 stmt_vec_info structs of all the stmts in the loop. */
926 _loop_vec_info::~_loop_vec_info ()
928 free (bbs);
930 release_vec_loop_controls (&masks);
931 release_vec_loop_controls (&lens);
932 delete ivexpr_map;
933 delete scan_map;
934 epilogue_vinfos.release ();
936 /* When we release an epilogue vinfo that we do not intend to use
937 avoid clearing AUX of the main loop which should continue to
938 point to the main loop vinfo since otherwise we'll leak that. */
939 if (loop->aux == this)
940 loop->aux = NULL;
943 /* Return an invariant or register for EXPR and emit necessary
944 computations in the LOOP_VINFO loop preheader. */
946 tree
947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 if (is_gimple_reg (expr)
950 || is_gimple_min_invariant (expr))
951 return expr;
953 if (! loop_vinfo->ivexpr_map)
954 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
955 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
956 if (! cached)
958 gimple_seq stmts = NULL;
959 cached = force_gimple_operand (unshare_expr (expr),
960 &stmts, true, NULL_TREE);
961 if (stmts)
963 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
964 gsi_insert_seq_on_edge_immediate (e, stmts);
967 return cached;
970 /* Return true if we can use CMP_TYPE as the comparison type to produce
971 all masks required to mask LOOP_VINFO. */
973 static bool
974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 rgroup_controls *rgm;
977 unsigned int i;
978 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
979 if (rgm->type != NULL_TREE
980 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
981 cmp_type, rgm->type,
982 OPTIMIZE_FOR_SPEED))
983 return false;
984 return true;
987 /* Calculate the maximum number of scalars per iteration for every
988 rgroup in LOOP_VINFO. */
990 static unsigned int
991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 unsigned int res = 1;
994 unsigned int i;
995 rgroup_controls *rgm;
996 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
997 res = MAX (res, rgm->max_nscalars_per_iter);
998 return res;
1001 /* Calculate the minimum precision necessary to represent:
1003 MAX_NITERS * FACTOR
1005 as an unsigned integer, where MAX_NITERS is the maximum number of
1006 loop header iterations for the original scalar form of LOOP_VINFO. */
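/* A small worked example, assuming MAX_NITERS is known to be at most 1000
   and FACTOR is 4: 1000 * 4 = 4000 and 2048 <= 4000 <= 4095, so 12 bits
   are needed and the function returns 12. A stand-alone sketch of the
   same computation for nonzero unsigned values (mirroring what
   wi::min_precision does in the return statement below):

     unsigned
     min_prec_for (unsigned long long max_niters, unsigned factor)
     {
       unsigned long long limit = max_niters * factor;
       unsigned prec = 0;
       while (limit)
         {
           prec++;
           limit >>= 1;
         }
       return prec;
     }
*/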
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013 /* Get the maximum number of iterations that is representable
1014 in the counter type. */
1015 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018 /* Get a more refined estimate for the number of iterations. */
1019 widest_int max_back_edges;
1020 if (max_loop_iterations (loop, &max_back_edges))
1021 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023 /* Work out how many bits we need to represent the limit. */
1024 return wi::min_precision (max_ni * factor, UNSIGNED);
1027 /* True if the loop needs peeling or partial vectors when vectorized. */
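/* For example, assuming VF = 4, a known iteration count of 10 and no
   peeling for alignment or gaps: 10 is not a multiple of 4, so two scalar
   iterations would have to be peeled (or the last vector iteration run
   with partially-populated vectors) and the function returns true; with
   an iteration count of 12 it returns false. */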
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 unsigned HOST_WIDE_INT const_vf;
1033 HOST_WIDE_INT max_niter
1034 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039 (loop_vinfo));
1041 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 /* Work out the (constant) number of iterations that need to be
1045 peeled for reasons other than niters. */
1046 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048 peel_niter += 1;
1049 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051 return true;
1053 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054 /* ??? When peeling for gaps but not alignment, we could
1055 try to check whether the (variable) niters is known to be
1056 VF * N + 1. That's something of a niche case though. */
1057 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060 < (unsigned) exact_log2 (const_vf))
1061 /* In case of versioning, check if the maximum number of
1062 iterations is greater than th. If they are identical,
1063 the epilogue is unnecessary. */
1064 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065 || ((unsigned HOST_WIDE_INT) max_niter
1066 > (th / const_vf) * const_vf))))
1067 return true;
1069 return false;
1072 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1073 whether we can actually generate the masks required. Return true if so,
1074 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 unsigned int min_ni_width;
1080 unsigned int max_nscalars_per_iter
1081 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083 /* Use a normal loop if there are no statements that need masking.
1084 This only happens in rare degenerate cases: it means that the loop
1085 has no loads, no stores, and no live-out values. */
1086 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087 return false;
1089 /* Work out how many bits we need to represent the limit. */
1090 min_ni_width
1091 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093 /* Find a scalar mode for which WHILE_ULT is supported. */
1094 opt_scalar_int_mode cmp_mode_iter;
1095 tree cmp_type = NULL_TREE;
1096 tree iv_type = NULL_TREE;
1097 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098 unsigned int iv_precision = UINT_MAX;
1100 if (iv_limit != -1)
1101 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102 UNSIGNED);
1104 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107 if (cmp_bits >= min_ni_width
1108 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111 if (this_type
1112 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 /* Although we could stop as soon as we find a valid mode,
1115 there are at least two reasons why that's not always the
1116 best choice:
1118 - An IV that's Pmode or wider is more likely to be reusable
1119 in address calculations than an IV that's narrower than
1120 Pmode.
1122 - Doing the comparison in IV_PRECISION or wider allows
1123 a natural 0-based IV, whereas using a narrower comparison
1124 type requires mitigations against wrap-around.
1126 Conversely, if the IV limit is variable, doing the comparison
1127 in a wider type than the original type can introduce
1128 unnecessary extensions, so picking the widest valid mode
1129 is not always a good choice either.
1131 Here we prefer the first IV type that's Pmode or wider,
1132 and the first comparison type that's IV_PRECISION or wider.
1133 (The comparison type must be no wider than the IV type,
1134 to avoid extensions in the vector loop.)
1136 ??? We might want to try continuing beyond Pmode for ILP32
1137 targets if CMP_BITS < IV_PRECISION. */
1138 iv_type = this_type;
1139 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140 cmp_type = this_type;
1141 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142 break;
1147 if (!cmp_type)
1148 return false;
1150 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152 return true;
1155 /* Check whether we can use vector access with length based on precision
1156 comparison. So far, to keep it simple, we only allow the case that the
1157 precision of the target supported length is larger than the precision
1158 required by loop niters. */
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164 return false;
1166 unsigned int max_nitems_per_iter = 1;
1167 unsigned int i;
1168 rgroup_controls *rgl;
1169 /* Find the maximum number of items per iteration for every rgroup. */
1170 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1176 /* Work out how many bits we need to represent the length limit. */
1177 unsigned int min_ni_prec
1178 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180 /* Now use the maximum of the precisions below for one suitable IV type:
1181 - the IV's natural precision
1182 - the precision needed to hold: the maximum number of scalar
1183 iterations multiplied by the scale factor (min_ni_prec above)
1184 - the Pmode precision
1186 If min_ni_prec is less than the precision of the current niters,
1187 we prefer to still use the niters type. Prefer to use Pmode and
1188 wider IV to avoid narrow conversions. */
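  /* For example, assuming min_ni_prec is 12, the niters type is 32 bits
     wide and Pmode is 64 bits wide: the maximum of the three is 64, so a
     64-bit IV type is chosen below (provided such a scalar mode is
     supported and does not exceed BITS_PER_WORD).  */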
1190 unsigned int ni_prec
1191 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192 min_ni_prec = MAX (min_ni_prec, ni_prec);
1193 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195 tree iv_type = NULL_TREE;
1196 opt_scalar_int_mode tmode_iter;
1197 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 scalar_mode tmode = tmode_iter.require ();
1200 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202 /* ??? Do we really want to construct one IV whose precision exceeds
1203 BITS_PER_WORD? */
1204 if (tbits > BITS_PER_WORD)
1205 break;
1207 /* Find the first available standard integral type. */
1208 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 iv_type = build_nonstandard_integer_type (tbits, true);
1211 break;
1215 if (!iv_type)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "can't vectorize with length-based partial vectors"
1220 " because there is no suitable iv type.\n");
1221 return false;
1224 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227 return true;
1230 /* Calculate the cost of one scalar iteration of the loop. */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236 int nbbs = loop->num_nodes, factor;
1237 int innerloop_iters, i;
1239 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241 /* Gather costs for statements in the scalar loop. */
1243 /* FORNOW. */
1244 innerloop_iters = 1;
1245 if (loop->inner)
1246 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248 for (i = 0; i < nbbs; i++)
1250 gimple_stmt_iterator si;
1251 basic_block bb = bbs[i];
1253 if (bb->loop_father == loop->inner)
1254 factor = innerloop_iters;
1255 else
1256 factor = 1;
1258 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 gimple *stmt = gsi_stmt (si);
1261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264 continue;
1266 /* Skip stmts that are not vectorized inside the loop. */
1267 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269 && (!STMT_VINFO_LIVE_P (vstmt_info)
1270 || !VECTORIZABLE_CYCLE_DEF
1271 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272 continue;
1274 vect_cost_for_stmt kind;
1275 if (STMT_VINFO_DATA_REF (stmt_info))
1277 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278 kind = scalar_load;
1279 else
1280 kind = scalar_store;
1282 else if (vect_nop_conversion_p (stmt_info))
1283 continue;
1284 else
1285 kind = scalar_stmt;
1287 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1288 factor, kind, stmt_info, 0, vect_prologue);
1292 /* Now accumulate cost. */
1293 void *target_cost_data = init_cost (loop, true);
1294 stmt_info_for_cost *si;
1295 int j;
1296 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1297 j, si)
1298 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1299 si->kind, si->stmt_info, si->vectype,
1300 si->misalign, vect_body);
1301 unsigned dummy, body_cost = 0;
1302 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1303 destroy_cost_data (target_cost_data);
1304 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1308 /* Function vect_analyze_loop_form_1.
1310 Verify that certain CFG restrictions hold, including:
1311 - the loop has a pre-header
1312 - the loop has a single entry and exit
1313 - the loop exit condition is simple enough
1314 - the number of iterations can be analyzed, i.e., a countable loop. The
1315 niter could be analyzed under some assumptions. */
1317 opt_result
1318 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1319 tree *assumptions, tree *number_of_iterationsm1,
1320 tree *number_of_iterations, gcond **inner_loop_cond)
1322 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1324 /* Different restrictions apply when we are considering an inner-most loop,
1325 vs. an outer (nested) loop.
1326 (FORNOW. May want to relax some of these restrictions in the future). */
1328 if (!loop->inner)
1330 /* Inner-most loop. We currently require that the number of BBs is
1331 exactly 2 (the header and latch). Vectorizable inner-most loops
1332 look like this:
1334 (pre-header)
1336 header <--------+
1337 | | |
1338 | +--> latch --+
1340 (exit-bb) */
1342 if (loop->num_nodes != 2)
1343 return opt_result::failure_at (vect_location,
1344 "not vectorized:"
1345 " control flow in loop.\n");
1347 if (empty_block_p (loop->header))
1348 return opt_result::failure_at (vect_location,
1349 "not vectorized: empty loop.\n");
1351 else
1353 class loop *innerloop = loop->inner;
1354 edge entryedge;
1356 /* Nested loop. We currently require that the loop is doubly-nested,
1357 contains a single inner loop, and the number of BBs is exactly 5.
1358 Vectorizable outer-loops look like this:
1360 (pre-header)
1362 header <---+
1364 inner-loop |
1366 tail ------+
1368 (exit-bb)
1370 The inner-loop has the properties expected of inner-most loops
1371 as described above. */
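	 /* A source-level sketch of a nest with this shape, assuming a
	    plain rectangular loop nest:

	      for (i = 0; i < n; i++)        outer loop being analyzed
		for (j = 0; j < m; j++)      single inner-most loop
		  a[i][j] = b[i][j] + 1;
	 */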
1373 if ((loop->inner)->inner || (loop->inner)->next)
1374 return opt_result::failure_at (vect_location,
1375 "not vectorized:"
1376 " multiple nested loops.\n");
1378 if (loop->num_nodes != 5)
1379 return opt_result::failure_at (vect_location,
1380 "not vectorized:"
1381 " control flow in loop.\n");
1383 entryedge = loop_preheader_edge (innerloop);
1384 if (entryedge->src != loop->header
1385 || !single_exit (innerloop)
1386 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1387 return opt_result::failure_at (vect_location,
1388 "not vectorized:"
1389 " unsupported outerloop form.\n");
1391 /* Analyze the inner-loop. */
1392 tree inner_niterm1, inner_niter, inner_assumptions;
1393 opt_result res
1394 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1395 &inner_assumptions, &inner_niterm1,
1396 &inner_niter, NULL);
1397 if (!res)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "not vectorized: Bad inner loop.\n");
1402 return res;
1405 /* Don't support analyzing niter under assumptions for inner
1406 loop. */
1407 if (!integer_onep (inner_assumptions))
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized: Bad inner loop.\n");
1411 if (!expr_invariant_in_loop_p (loop, inner_niter))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: inner-loop count not"
1414 " invariant.\n");
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "Considering outer-loop vectorization.\n");
1421 if (!single_exit (loop))
1422 return opt_result::failure_at (vect_location,
1423 "not vectorized: multiple exits.\n");
1424 if (EDGE_COUNT (loop->header->preds) != 2)
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized:"
1427 " too many incoming edges.\n");
1429 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1430 that the loop is represented as a do-while (with a proper if-guard
1431 before the loop if needed), where the loop header contains all the
1432 executable statements, and the latch is empty. */
1433 if (!empty_block_p (loop->latch)
1434 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1435 return opt_result::failure_at (vect_location,
1436 "not vectorized: latch block not empty.\n");
1438 /* Make sure the exit is not abnormal. */
1439 edge e = single_exit (loop);
1440 if (e->flags & EDGE_ABNORMAL)
1441 return opt_result::failure_at (vect_location,
1442 "not vectorized:"
1443 " abnormal loop exit edge.\n");
1445 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1446 number_of_iterationsm1);
1447 if (!*loop_cond)
1448 return opt_result::failure_at
1449 (vect_location,
1450 "not vectorized: complicated exit condition.\n");
1452 if (integer_zerop (*assumptions)
1453 || !*number_of_iterations
1454 || chrec_contains_undetermined (*number_of_iterations))
1455 return opt_result::failure_at
1456 (*loop_cond,
1457 "not vectorized: number of iterations cannot be computed.\n");
1459 if (integer_zerop (*number_of_iterations))
1460 return opt_result::failure_at
1461 (*loop_cond,
1462 "not vectorized: number of iterations = 0.\n");
1464 return opt_result::success ();
1467 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1469 opt_loop_vec_info
1470 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1472 tree assumptions, number_of_iterations, number_of_iterationsm1;
1473 gcond *loop_cond, *inner_loop_cond = NULL;
1475 opt_result res
1476 = vect_analyze_loop_form_1 (loop, &loop_cond,
1477 &assumptions, &number_of_iterationsm1,
1478 &number_of_iterations, &inner_loop_cond);
1479 if (!res)
1480 return opt_loop_vec_info::propagate_failure (res);
1482 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1483 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1484 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1485 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1486 if (!integer_onep (assumptions))
1488 /* We consider to vectorize this loop by versioning it under
1489 some assumptions. In order to do this, we need to clear
1490 existing information computed by scev and niter analyzer. */
1491 scev_reset_htab ();
1492 free_numbers_of_iterations_estimates (loop);
1493 /* Also set flag for this loop so that following scev and niter
1494 analysis are done under the assumptions. */
1495 loop_constraint_set (loop, LOOP_C_FINITE);
1496 /* Also record the assumptions for versioning. */
1497 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1500 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1502 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_NOTE, vect_location,
1505 "Symbolic number of iterations is ");
1506 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1507 dump_printf (MSG_NOTE, "\n");
1511 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1512 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1513 if (inner_loop_cond)
1515 stmt_vec_info inner_loop_cond_info
1516 = loop_vinfo->lookup_stmt (inner_loop_cond);
1517 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1520 gcc_assert (!loop->aux);
1521 loop->aux = loop_vinfo;
1522 return opt_loop_vec_info::success (loop_vinfo);
1527 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1528 statements update the vectorization factor. */
1530 static void
1531 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1533 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1534 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1535 int nbbs = loop->num_nodes;
1536 poly_uint64 vectorization_factor;
1537 int i;
1539 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1541 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1542 gcc_assert (known_ne (vectorization_factor, 0U));
1544 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1545 vectorization factor of the loop is the unrolling factor required by
1546 the SLP instances. If that unrolling factor is 1, we say that we
1547 perform pure SLP on the loop - cross-iteration parallelism is not
1548 exploited. */
1549 bool only_slp_in_loop = true;
1550 for (i = 0; i < nbbs; i++)
1552 basic_block bb = bbs[i];
1553 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1554 gsi_next (&si))
1556 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1557 if (!stmt_info)
1558 continue;
1559 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1560 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1561 && !PURE_SLP_STMT (stmt_info))
1562 /* STMT needs both SLP and loop-based vectorization. */
1563 only_slp_in_loop = false;
1565 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566 gsi_next (&si))
1568 if (is_gimple_debug (gsi_stmt (si)))
1569 continue;
1570 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1571 stmt_info = vect_stmt_to_vectorize (stmt_info);
1572 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 && !PURE_SLP_STMT (stmt_info))
1575 /* STMT needs both SLP and loop-based vectorization. */
1576 only_slp_in_loop = false;
1580 if (only_slp_in_loop)
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location,
1584 "Loop contains only SLP stmts\n");
1585 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1587 else
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_NOTE, vect_location,
1591 "Loop contains SLP and non-SLP stmts\n");
1592 /* Both the vectorization factor and unroll factor have the form
1593 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1594 so they must have a common multiple. */
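	 /* E.g., assuming a current factor of 4 and an SLP unrolling factor
	    of 6, the updated vectorization factor computed below would be
	    their least common multiple, 12.  */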
1595 vectorization_factor
1596 = force_common_multiple (vectorization_factor,
1597 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1600 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1601 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Updating vectorization factor to ");
1605 dump_dec (MSG_NOTE, vectorization_factor);
1606 dump_printf (MSG_NOTE, ".\n");
1610 /* Return true if STMT_INFO describes a double reduction phi and if
1611 the other phi in the reduction is also relevant for vectorization.
1612 This rejects cases such as:
1614 outer1:
1615 x_1 = PHI <x_3(outer2), ...>;
1618 inner:
1619 x_2 = ...;
1622 outer2:
1623 x_3 = PHI <x_2(inner)>;
1625 if nothing in x_2 or elsewhere makes x_1 relevant. */
1627 static bool
1628 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1630 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1631 return false;
1633 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1636 /* Function vect_analyze_loop_operations.
1638 Scan the loop stmts and make sure they are all vectorizable. */
1640 static opt_result
1641 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1643 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1644 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1645 int nbbs = loop->num_nodes;
1646 int i;
1647 stmt_vec_info stmt_info;
1648 bool need_to_vectorize = false;
1649 bool ok;
1651 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1653 auto_vec<stmt_info_for_cost> cost_vec;
1655 for (i = 0; i < nbbs; i++)
1657 basic_block bb = bbs[i];
1659 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1660 gsi_next (&si))
1662 gphi *phi = si.phi ();
1663 ok = true;
1665 stmt_info = loop_vinfo->lookup_stmt (phi);
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1668 if (virtual_operand_p (gimple_phi_result (phi)))
1669 continue;
1671 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1672 (i.e., a phi in the tail of the outer-loop). */
1673 if (! is_loop_header_bb_p (bb))
1675 /* FORNOW: we currently don't support the case that these phis
1676 are not used in the outerloop (unless it is double reduction,
1677 i.e., this phi is vect_reduction_def), because this case
1678 requires us to actually do something here. */
1679 if (STMT_VINFO_LIVE_P (stmt_info)
1680 && !vect_active_double_reduction_p (stmt_info))
1681 return opt_result::failure_at (phi,
1682 "Unsupported loop-closed phi"
1683 " in outer-loop.\n");
1685 /* If PHI is used in the outer loop, we check that its operand
1686 is defined in the inner loop. */
1687 if (STMT_VINFO_RELEVANT_P (stmt_info))
1689 tree phi_op;
1691 if (gimple_phi_num_args (phi) != 1)
1692 return opt_result::failure_at (phi, "unsupported phi");
1694 phi_op = PHI_ARG_DEF (phi, 0);
1695 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1696 if (!op_def_info)
1697 return opt_result::failure_at (phi, "unsupported phi\n");
1699 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1700 && (STMT_VINFO_RELEVANT (op_def_info)
1701 != vect_used_in_outer_by_reduction))
1702 return opt_result::failure_at (phi, "unsupported phi\n");
1704 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1705 || (STMT_VINFO_DEF_TYPE (stmt_info)
1706 == vect_double_reduction_def))
1707 && !vectorizable_lc_phi (loop_vinfo,
1708 stmt_info, NULL, NULL))
1709 return opt_result::failure_at (phi, "unsupported phi\n");
1712 continue;
1715 gcc_assert (stmt_info);
1717 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1718 || STMT_VINFO_LIVE_P (stmt_info))
1719 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1720 /* A scalar-dependence cycle that we don't support. */
1721 return opt_result::failure_at (phi,
1722 "not vectorized:"
1723 " scalar dependence cycle.\n");
1725 if (STMT_VINFO_RELEVANT_P (stmt_info))
1727 need_to_vectorize = true;
1728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729 && ! PURE_SLP_STMT (stmt_info))
1730 ok = vectorizable_induction (loop_vinfo,
1731 stmt_info, NULL, NULL,
1732 &cost_vec);
1733 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1734 || (STMT_VINFO_DEF_TYPE (stmt_info)
1735 == vect_double_reduction_def)
1736 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1737 && ! PURE_SLP_STMT (stmt_info))
1738 ok = vectorizable_reduction (loop_vinfo,
1739 stmt_info, NULL, NULL, &cost_vec);
1742 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1743 if (ok
1744 && STMT_VINFO_LIVE_P (stmt_info)
1745 && !PURE_SLP_STMT (stmt_info))
1746 ok = vectorizable_live_operation (loop_vinfo,
1747 stmt_info, NULL, NULL, NULL,
1748 -1, false, &cost_vec);
1750 if (!ok)
1751 return opt_result::failure_at (phi,
1752 "not vectorized: relevant phi not "
1753 "supported: %G",
1754 static_cast <gimple *> (phi));
1757 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1758 gsi_next (&si))
1760 gimple *stmt = gsi_stmt (si);
1761 if (!gimple_clobber_p (stmt)
1762 && !is_gimple_debug (stmt))
1764 opt_result res
1765 = vect_analyze_stmt (loop_vinfo,
1766 loop_vinfo->lookup_stmt (stmt),
1767 &need_to_vectorize,
1768 NULL, NULL, &cost_vec);
1769 if (!res)
1770 return res;
1773 } /* bbs */
1775 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1777 /* All operations in the loop are either irrelevant (they deal with loop
1778 control, or are dead), or only used outside the loop and can be moved
1779 out of the loop (e.g. invariants, inductions). The loop can be
1780 optimized away by scalar optimizations. We're better off not
1781 touching this loop. */
1782 if (!need_to_vectorize)
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_NOTE, vect_location,
1786 "All the computation can be taken out of the loop.\n");
1787 return opt_result::failure_at
1788 (vect_location,
1789 "not vectorized: redundant loop. no profit to vectorize.\n");
1792 return opt_result::success ();
1795 /* Return true if we know that the iteration count is smaller than the
1796 vectorization factor. Return false if it isn't, or if we can't be sure
1797 either way. */
1799 static bool
1800 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1802 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1804 HOST_WIDE_INT max_niter;
1805 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1806 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1807 else
1808 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1810 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1811 return true;
1813 return false;
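/* Illustrative example (hypothetical loop, not taken from any testcase):
   with

     for (i = 0; i < 3; i++)
       a[i] = b[i] + c[i];

   and an assumed vectorization factor of 4, the known iteration count (3)
   is smaller than the VF, so this function returns true and callers can
   reject full-vector vectorization up front. */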
1816 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1817 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1818 definitely no, or -1 if it's worth retrying. */
1820 static int
1821 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1823 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1824 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1826 /* Only loops that can handle partially-populated vectors can have iteration
1827 counts less than the vectorization factor. */
1828 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1830 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1832 if (dump_enabled_p ())
1833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834 "not vectorized: iteration count smaller than "
1835 "vectorization factor.\n");
1836 return 0;
1840 /* If using the "very cheap" model, reject cases in which we'd keep
1841 a copy of the scalar code (even if we might be able to vectorize it). */
1842 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1843 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1844 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1845 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1847 if (dump_enabled_p ())
1848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849 "some scalar iterations would need to be peeled\n");
1850 return 0;
1853 int min_profitable_iters, min_profitable_estimate;
1854 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1855 &min_profitable_estimate);
1857 if (min_profitable_iters < 0)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "not vectorized: vector version will never be "
1865 "profitable.\n");
1866 return -1;
1869 int min_scalar_loop_bound = (param_min_vect_loop_bound
1870 * assumed_vf);
1872 /* Use the cost model only if it is more conservative than the
1873 user-specified threshold. */
1874 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1875 min_profitable_iters);
1877 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
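/* For example (hypothetical numbers): with --param min-vect-loop-bound=2
   and an assumed VF of 4, min_scalar_loop_bound is 2 * 4 = 8; if the cost
   model computed min_profitable_iters as 10, TH becomes MAX (8, 10) = 10,
   and a loop known to run fewer than 10 iterations is rejected below. */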
1879 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1880 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 "not vectorized: vectorization not profitable.\n");
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "not vectorized: iteration count smaller than user "
1888 "specified loop bound parameter or minimum profitable "
1889 "iterations (whichever is more conservative).\n");
1890 return 0;
1893 /* The static profitability threshold min_profitable_estimate includes
1894 the cost of having to check at runtime whether the scalar loop
1895 should be used instead. If it turns out that we don't need or want
1896 such a check, the threshold we should use for the static estimate
1897 is simply the point at which the vector loop becomes more profitable
1898 than the scalar loop. */
1899 if (min_profitable_estimate > min_profitable_iters
1900 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1901 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1902 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1903 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1907 " choice between the scalar and vector loops\n");
1908 min_profitable_estimate = min_profitable_iters;
1911 /* If the vector loop needs multiple iterations to be beneficial then
1912 things are probably too close to call, and the conservative thing
1913 would be to stick with the scalar code. */
1914 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1915 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919 "one iteration of the vector loop would be"
1920 " more expensive than the equivalent number of"
1921 " iterations of the scalar loop\n");
1922 return 0;
1925 HOST_WIDE_INT estimated_niter;
1927 /* If we are vectorizing an epilogue then we know the maximum number of
1928 scalar iterations it will cover is at least one lower than the
1929 vectorization factor of the main loop. */
1930 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1931 estimated_niter
1932 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1933 else
1935 estimated_niter = estimated_stmt_executions_int (loop);
1936 if (estimated_niter == -1)
1937 estimated_niter = likely_max_stmt_executions_int (loop);
1939 if (estimated_niter != -1
1940 && ((unsigned HOST_WIDE_INT) estimated_niter
1941 < MAX (th, (unsigned) min_profitable_estimate)))
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1945 "not vectorized: estimated iteration count too "
1946 "small.\n");
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "not vectorized: estimated iteration count smaller "
1950 "than specified loop bound parameter or minimum "
1951 "profitable iterations (whichever is more "
1952 "conservative).\n");
1953 return -1;
1956 return 1;
1959 static opt_result
1960 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1961 vec<data_reference_p> *datarefs,
1962 unsigned int *n_stmts)
1964 *n_stmts = 0;
1965 for (unsigned i = 0; i < loop->num_nodes; i++)
1966 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1967 !gsi_end_p (gsi); gsi_next (&gsi))
1969 gimple *stmt = gsi_stmt (gsi);
1970 if (is_gimple_debug (stmt))
1971 continue;
1972 ++(*n_stmts);
1973 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1974 NULL, 0);
1975 if (!res)
1977 if (is_gimple_call (stmt) && loop->safelen)
1979 tree fndecl = gimple_call_fndecl (stmt), op;
1980 if (fndecl != NULL_TREE)
1982 cgraph_node *node = cgraph_node::get (fndecl);
1983 if (node != NULL && node->simd_clones != NULL)
1985 unsigned int j, n = gimple_call_num_args (stmt);
1986 for (j = 0; j < n; j++)
1988 op = gimple_call_arg (stmt, j);
1989 if (DECL_P (op)
1990 || (REFERENCE_CLASS_P (op)
1991 && get_base_address (op)))
1992 break;
1994 op = gimple_call_lhs (stmt);
1995 /* Ignore #pragma omp declare simd functions
1996 if they don't have data references in the
1997 call stmt itself. */
1998 if (j == n
1999 && !(op
2000 && (DECL_P (op)
2001 || (REFERENCE_CLASS_P (op)
2002 && get_base_address (op)))))
2003 continue;
2007 return res;
2009 /* If dependence analysis will give up due to the limit on the
2010 number of datarefs, stop here and fail fatally. */
2011 if (datarefs->length ()
2012 > (unsigned)param_loop_max_datarefs_for_datadeps)
2013 return opt_result::failure_at (stmt, "exceeded param "
2014 "loop-max-datarefs-for-datadeps\n");
2016 return opt_result::success ();
2019 /* Look for SLP-only access groups and turn each individual access into its own
2020 group. */
2021 static void
2022 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2024 unsigned int i;
2025 struct data_reference *dr;
2027 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2029 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2030 FOR_EACH_VEC_ELT (datarefs, i, dr)
2032 gcc_assert (DR_REF (dr));
2033 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2035 /* Check if the load is part of an interleaving chain. */
2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2039 unsigned int group_size = DR_GROUP_SIZE (first_element);
2041 /* Check if this is an SLP-only group. */
2042 if (!STMT_SLP_TYPE (stmt_info)
2043 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2045 /* Dissolve the group. */
2046 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2048 stmt_vec_info vinfo = first_element;
2049 while (vinfo)
2051 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2052 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2053 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2054 DR_GROUP_SIZE (vinfo) = 1;
2055 if (STMT_VINFO_STRIDED_P (first_element))
2056 DR_GROUP_GAP (vinfo) = 0;
2057 else
2058 DR_GROUP_GAP (vinfo) = group_size - 1;
2059 vinfo = next;
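/* For example (hypothetical access group): a store group of size 4 that
   was usable only with SLP is dissolved above into four single-element
   groups; each keeps the original stride by recording a gap of
   group_size - 1 = 3 elements (or 0 for strided accesses), so non-SLP
   vectorization can still handle the individual accesses. */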
2066 /* Determine if operating on full vectors for LOOP_VINFO might leave
2067 some scalar iterations still to do. If so, decide how we should
2068 handle those scalar iterations. The possibilities are:
2070 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2071 In this case:
2073 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2074 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2075 LOOP_VINFO_PEELING_FOR_NITER == false
2077 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2078 to handle the remaining scalar iterations. In this case:
2080 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2081 LOOP_VINFO_PEELING_FOR_NITER == true
2083 There are two choices:
2085 (2a) Consider vectorizing the epilogue loop at the same VF as the
2086 main loop, but using partial vectors instead of full vectors.
2087 In this case:
2089 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2091 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2092 In this case:
2094 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2096 When FOR_EPILOGUE_P is true, make this determination based on the
2097 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2098 based on the assumption that LOOP_VINFO is the main loop. The caller
2099 has made sure that the number of iterations is set appropriately for
2100 this value of FOR_EPILOGUE_P. */
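/* For example (hypothetical counts): with 100 scalar iterations and a VF
   of 16, choice (1) executes ceil (100/16) = 7 partially-populated vector
   iterations and needs no scalar epilogue, while choice (2) executes
   100/16 = 6 full-vector iterations and leaves 100 % 16 = 4 scalar
   iterations for an epilogue loop (handled by 2a or 2b). */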
2102 opt_result
2103 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2104 bool for_epilogue_p)
2106 /* Determine whether there would be any scalar iterations left over. */
2107 bool need_peeling_or_partial_vectors_p
2108 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2110 /* Decide whether to vectorize the loop with partial vectors. */
2111 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2112 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2113 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2114 && need_peeling_or_partial_vectors_p)
2116 /* For partial-vector-usage=1, try to push the handling of partial
2117 vectors to the epilogue, with the main loop continuing to operate
2118 on full vectors.
2120 ??? We could then end up failing to use partial vectors if we
2121 decide to peel iterations into a prologue, and if the main loop
2122 then ends up processing fewer than VF iterations. */
2123 if (param_vect_partial_vector_usage == 1
2124 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2125 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2126 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2127 else
2128 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2131 if (dump_enabled_p ())
2133 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2134 dump_printf_loc (MSG_NOTE, vect_location,
2135 "operating on partial vectors%s.\n",
2136 for_epilogue_p ? " for epilogue loop" : "");
2137 else
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "operating only on full vectors%s.\n",
2140 for_epilogue_p ? " for epilogue loop" : "");
2143 if (for_epilogue_p)
2145 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2146 gcc_assert (orig_loop_vinfo);
2147 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2148 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2149 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2152 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2153 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2155 /* Check that the loop processes at least one full vector. */
2156 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2157 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2158 if (known_lt (wi::to_widest (scalar_niters), vf))
2159 return opt_result::failure_at (vect_location,
2160 "loop does not have enough iterations"
2161 " to support vectorization.\n");
2163 /* If we need to peel an extra epilogue iteration to handle data
2164 accesses with gaps, check that there are enough scalar iterations
2165 available.
2167 The check above is redundant with this one when peeling for gaps,
2168 but the distinction is useful for diagnostics. */
2169 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2170 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2171 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2172 return opt_result::failure_at (vect_location,
2173 "loop does not have enough iterations"
2174 " to support peeling for gaps.\n");
2177 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2178 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2179 && need_peeling_or_partial_vectors_p);
2181 return opt_result::success ();
2184 /* Function vect_analyze_loop_2.
2186 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2187 for it. The different analyses will record information in the
2188 loop_vec_info struct. */
2189 static opt_result
2190 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2192 opt_result ok = opt_result::success ();
2193 int res;
2194 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2195 poly_uint64 min_vf = 2;
2196 loop_vec_info orig_loop_vinfo = NULL;
2198 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2199 loop_vec_info of the first vectorized loop. */
2200 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2201 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2202 else
2203 orig_loop_vinfo = loop_vinfo;
2204 gcc_assert (orig_loop_vinfo);
2206 /* The first group of checks is independent of the vector size. */
2207 fatal = true;
2209 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2210 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2211 return opt_result::failure_at (vect_location,
2212 "not vectorized: simd if(0)\n");
2214 /* Find all data references in the loop (which correspond to vdefs/vuses)
2215 and analyze their evolution in the loop. */
2217 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2219 /* Gather the data references and count stmts in the loop. */
2220 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2222 opt_result res
2223 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2224 &LOOP_VINFO_DATAREFS (loop_vinfo),
2225 n_stmts);
2226 if (!res)
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "not vectorized: loop contains function "
2231 "calls or data references that cannot "
2232 "be analyzed\n");
2233 return res;
2235 loop_vinfo->shared->save_datarefs ();
2237 else
2238 loop_vinfo->shared->check_datarefs ();
2240 /* Analyze the data references and also adjust the minimal
2241 vectorization factor according to the loads and stores. */
2243 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2244 if (!ok)
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "bad data references.\n");
2249 return ok;
2252 /* Classify all cross-iteration scalar data-flow cycles.
2253 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2254 vect_analyze_scalar_cycles (loop_vinfo);
2256 vect_pattern_recog (loop_vinfo);
2258 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2260 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2261 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2263 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2264 if (!ok)
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2268 "bad data access.\n");
2269 return ok;
2272 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2274 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2275 if (!ok)
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279 "unexpected pattern.\n");
2280 return ok;
2283 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2284 fatal = false;
2286 /* Analyze data dependences between the data-refs in the loop
2287 and adjust the maximum vectorization factor according to
2288 the dependences.
2289 FORNOW: fail at the first data dependence that we encounter. */
2291 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2292 if (!ok)
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "bad data dependence.\n");
2297 return ok;
2299 if (max_vf != MAX_VECTORIZATION_FACTOR
2300 && maybe_lt (max_vf, min_vf))
2301 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2302 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2304 ok = vect_determine_vectorization_factor (loop_vinfo);
2305 if (!ok)
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "can't determine vectorization factor.\n");
2310 return ok;
2312 if (max_vf != MAX_VECTORIZATION_FACTOR
2313 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2314 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2316 /* Compute the scalar iteration cost. */
2317 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2319 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2321 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2322 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2323 if (!ok)
2324 return ok;
2326 /* If there are any SLP instances mark them as pure_slp. */
2327 bool slp = vect_make_slp_decision (loop_vinfo);
2328 if (slp)
2330 /* Find stmts that need to be both vectorized and SLPed. */
2331 vect_detect_hybrid_slp (loop_vinfo);
2333 /* Update the vectorization factor based on the SLP decision. */
2334 vect_update_vf_for_slp (loop_vinfo);
2336 /* Optimize the SLP graph with the vectorization factor fixed. */
2337 vect_optimize_slp (loop_vinfo);
2339 /* Gather the loads reachable from the SLP graph entries. */
2340 vect_gather_slp_loads (loop_vinfo);
2343 bool saved_can_use_partial_vectors_p
2344 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2346 /* We don't expect to have to roll back to anything other than an empty
2347 set of rgroups. */
2348 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2350 /* This is the point where we can re-start analysis with SLP forced off. */
2351 start_over:
2353 /* Now the vectorization factor is final. */
2354 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2355 gcc_assert (known_ne (vectorization_factor, 0U));
2357 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2359 dump_printf_loc (MSG_NOTE, vect_location,
2360 "vectorization_factor = ");
2361 dump_dec (MSG_NOTE, vectorization_factor);
2362 dump_printf (MSG_NOTE, ", niters = %wd\n",
2363 LOOP_VINFO_INT_NITERS (loop_vinfo));
2366 /* Analyze the alignment of the data-refs in the loop.
2367 Fail if a data reference is found that cannot be vectorized. */
2369 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2370 if (!ok)
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374 "bad data alignment.\n");
2375 return ok;
2378 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2379 It is important to call pruning after vect_analyze_data_ref_accesses,
2380 since we use grouping information gathered by interleaving analysis. */
2381 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2382 if (!ok)
2383 return ok;
2385 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2386 vectorization, since we do not want to add extra peeling or
2387 add versioning for alignment. */
2388 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2389 /* This pass will decide on using loop versioning and/or loop peeling in
2390 order to enhance the alignment of data references in the loop. */
2391 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2392 if (!ok)
2393 return ok;
2395 if (slp)
2397 /* Analyze operations in the SLP instances. Note this may
2398 remove unsupported SLP instances which makes the above
2399 SLP kind detection invalid. */
2400 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2401 vect_slp_analyze_operations (loop_vinfo);
2402 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2404 ok = opt_result::failure_at (vect_location,
2405 "unsupported SLP instances\n");
2406 goto again;
2409 /* Check whether any load in ALL SLP instances is possibly permuted. */
2410 slp_tree load_node, slp_root;
2411 unsigned i, x;
2412 slp_instance instance;
2413 bool can_use_lanes = true;
2414 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2416 slp_root = SLP_INSTANCE_TREE (instance);
2417 int group_size = SLP_TREE_LANES (slp_root);
2418 tree vectype = SLP_TREE_VECTYPE (slp_root);
2419 bool loads_permuted = false;
2420 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2422 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2423 continue;
2424 unsigned j;
2425 stmt_vec_info load_info;
2426 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2427 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2429 loads_permuted = true;
2430 break;
2434 /* If the loads and stores can be handled with load/store-lane
2435 instructions, record it and move on to the next instance. */
2436 if (loads_permuted
2437 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2438 && vect_store_lanes_supported (vectype, group_size, false))
2440 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2442 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2443 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2444 /* Use SLP for strided accesses (or if we can't
2445 load-lanes). */
2446 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2447 || ! vect_load_lanes_supported
2448 (STMT_VINFO_VECTYPE (stmt_vinfo),
2449 DR_GROUP_SIZE (stmt_vinfo), false))
2450 break;
2453 can_use_lanes
2454 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2456 if (can_use_lanes && dump_enabled_p ())
2457 dump_printf_loc (MSG_NOTE, vect_location,
2458 "SLP instance %p can use load/store-lanes\n",
2459 instance);
2461 else
2463 can_use_lanes = false;
2464 break;
2468 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2469 with SLP disabled. */
2470 if (can_use_lanes)
2472 ok = opt_result::failure_at (vect_location,
2473 "Built SLP cancelled: can use "
2474 "load/store-lanes\n");
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2477 "Built SLP cancelled: all SLP instances support "
2478 "load/store-lanes\n");
2479 goto again;
2483 /* Dissolve SLP-only groups. */
2484 vect_dissolve_slp_only_groups (loop_vinfo);
2486 /* Scan all the remaining operations in the loop that are not subject
2487 to SLP and make sure they are vectorizable. */
2488 ok = vect_analyze_loop_operations (loop_vinfo);
2489 if (!ok)
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493 "bad operation or unsupported loop bound.\n");
2494 return ok;
2497 /* For now, we don't expect to mix both masking and length approaches for one
2498 loop, so disable the use of partial vectors if both are recorded. */
2499 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2500 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2501 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2505 "can't vectorize a loop with partial vectors"
2506 " because we don't expect to mix different"
2507 " approaches with partial vectors for the"
2508 " same loop.\n");
2509 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2512 /* If we still have the option of using partial vectors,
2513 check whether we can generate the necessary loop controls. */
2514 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2515 && !vect_verify_full_masking (loop_vinfo)
2516 && !vect_verify_loop_lens (loop_vinfo))
2517 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2519 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2520 to be able to handle fewer than VF scalars, or needs to have a lower VF
2521 than the main loop. */
2522 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2523 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2524 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2525 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2526 return opt_result::failure_at (vect_location,
2527 "Vectorization factor too high for"
2528 " epilogue loop.\n");
2530 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2531 assuming that the loop will be used as a main loop. We will redo
2532 this analysis later if we instead decide to use the loop as an
2533 epilogue loop. */
2534 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2535 if (!ok)
2536 return ok;
2538 /* Check the costings of the loop make vectorizing worthwhile. */
2539 res = vect_analyze_loop_costing (loop_vinfo);
2540 if (res < 0)
2542 ok = opt_result::failure_at (vect_location,
2543 "Loop costings may not be worthwhile.\n");
2544 goto again;
2546 if (!res)
2547 return opt_result::failure_at (vect_location,
2548 "Loop costings not worthwhile.\n");
2550 /* If an epilogue loop is required make sure we can create one. */
2551 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2552 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2554 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2556 if (!vect_can_advance_ivs_p (loop_vinfo)
2557 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2558 single_exit (LOOP_VINFO_LOOP
2559 (loop_vinfo))))
2561 ok = opt_result::failure_at (vect_location,
2562 "not vectorized: can't create required "
2563 "epilog loop\n");
2564 goto again;
2568 /* During peeling, we need to check if the number of loop iterations is
2569 enough for both the peeled prolog loop and the vector loop. This check
2570 can be merged with the threshold check of loop versioning, so
2571 increase the threshold for this case if necessary.
2573 If we are analyzing an epilogue we still want to check what its
2574 versioning threshold would be. If we decide to vectorize the epilogues we
2575 will want to use the lowest versioning threshold of all epilogues and main
2576 loop. This will enable us to enter a vectorized epilogue even when
2577 versioning the loop. We can't simply check whether the epilogue requires
2578 versioning though since we may have skipped some versioning checks when
2579 analyzing the epilogue. For instance, checks for alias versioning will be
2580 skipped when dealing with epilogues as we assume we already checked them
2581 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2582 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2584 poly_uint64 niters_th = 0;
2585 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2587 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2589 /* Niters for peeled prolog loop. */
2590 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2592 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2593 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2594 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2596 else
2597 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2600 /* Niters for at least one iteration of vectorized loop. */
2601 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2602 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2603 /* One additional iteration because of peeling for gap. */
2604 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2605 niters_th += 1;
2607 /* Use the same condition as vect_transform_loop to decide when to use
2608 the cost to determine a versioning threshold. */
2609 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2610 && ordered_p (th, niters_th))
2611 niters_th = ordered_max (poly_uint64 (th), niters_th);
2613 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
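/* For example (hypothetical values): peeling 3 scalar iterations for
   alignment, a VF of 8 and one extra iteration for a gap give
   niters_th = 3 + 8 + 1 = 12; if the runtime profitability check is
   applied and the cost threshold TH is 20, the versioning threshold is
   raised to 20. */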
2616 gcc_assert (known_eq (vectorization_factor,
2617 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2619 /* Ok to vectorize! */
2620 return opt_result::success ();
2622 again:
2623 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2624 gcc_assert (!ok);
2626 /* Try again with SLP forced off but if we didn't do any SLP there is
2627 no point in re-trying. */
2628 if (!slp)
2629 return ok;
2631 /* If there are reduction chains re-trying will fail anyway. */
2632 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2633 return ok;
2635 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2636 via interleaving or lane instructions. */
2637 slp_instance instance;
2638 slp_tree node;
2639 unsigned i, j;
2640 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2642 stmt_vec_info vinfo;
2643 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2644 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2645 continue;
2646 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2647 unsigned int size = DR_GROUP_SIZE (vinfo);
2648 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2649 if (! vect_store_lanes_supported (vectype, size, false)
2650 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2651 && ! vect_grouped_store_supported (vectype, size))
2652 return opt_result::failure_at (vinfo->stmt,
2653 "unsupported grouped store\n");
2654 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2656 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2657 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2658 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2659 size = DR_GROUP_SIZE (vinfo);
2660 vectype = STMT_VINFO_VECTYPE (vinfo);
2661 if (! vect_load_lanes_supported (vectype, size, false)
2662 && ! vect_grouped_load_supported (vectype, single_element_p,
2663 size))
2664 return opt_result::failure_at (vinfo->stmt,
2665 "unsupported grouped load\n");
2669 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "re-trying with SLP disabled\n");
2673 /* Roll back state appropriately. No SLP this time. */
2674 slp = false;
2675 /* Restore vectorization factor as it were without SLP. */
2676 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2677 /* Free the SLP instances. */
2678 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2679 vect_free_slp_instance (instance);
2680 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2681 /* Reset SLP type to loop_vect on all stmts. */
2682 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2684 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2685 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2686 !gsi_end_p (si); gsi_next (&si))
2688 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2689 STMT_SLP_TYPE (stmt_info) = loop_vect;
2690 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2691 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2693 /* vectorizable_reduction adjusts reduction stmt def-types,
2694 restore them to that of the PHI. */
2695 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2696 = STMT_VINFO_DEF_TYPE (stmt_info);
2697 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2698 (STMT_VINFO_REDUC_DEF (stmt_info)))
2699 = STMT_VINFO_DEF_TYPE (stmt_info);
2702 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2703 !gsi_end_p (si); gsi_next (&si))
2705 if (is_gimple_debug (gsi_stmt (si)))
2706 continue;
2707 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2708 STMT_SLP_TYPE (stmt_info) = loop_vect;
2709 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2711 stmt_vec_info pattern_stmt_info
2712 = STMT_VINFO_RELATED_STMT (stmt_info);
2713 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2714 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2716 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2717 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2718 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2719 !gsi_end_p (pi); gsi_next (&pi))
2720 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2721 = loop_vect;
2725 /* Free optimized alias test DDRS. */
2726 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2727 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2728 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2729 /* Reset target cost data. */
2730 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2731 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2732 = init_cost (LOOP_VINFO_LOOP (loop_vinfo), false);
2733 /* Reset accumulated rgroup information. */
2734 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2735 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2736 /* Reset assorted flags. */
2737 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2738 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2739 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2740 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2741 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2742 = saved_can_use_partial_vectors_p;
2744 goto start_over;
2747 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2748 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2749 OLD_LOOP_VINFO is better unless something specifically indicates
2750 otherwise.
2752 Note that this deliberately isn't a partial order. */
2754 static bool
2755 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2756 loop_vec_info old_loop_vinfo)
2758 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2759 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2761 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2762 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2764 /* Always prefer a VF of loop->simdlen over any other VF. */
2765 if (loop->simdlen)
2767 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2768 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2769 if (new_simdlen_p != old_simdlen_p)
2770 return new_simdlen_p;
2773 /* Limit the VFs to what is likely to be the maximum number of iterations,
2774 to handle cases in which at least one loop_vinfo is fully-masked. */
2775 HOST_WIDE_INT estimated_max_niter;
2776 loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2777 unsigned HOST_WIDE_INT main_vf;
2778 if (main_loop
2779 && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2780 && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2781 estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2782 else
2783 estimated_max_niter = likely_max_stmt_executions_int (loop);
2784 if (estimated_max_niter != -1)
2786 if (known_le (estimated_max_niter, new_vf))
2787 new_vf = estimated_max_niter;
2788 if (known_le (estimated_max_niter, old_vf))
2789 old_vf = estimated_max_niter;
2792 /* Check whether the (fractional) cost per scalar iteration is lower
2793 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2794 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2795 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
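/* For example (hypothetical costs): a new body costing 20 at VF 8 against
   an old body costing 12 at VF 4 compares 20 * 4 = 80 with 12 * 8 = 96,
   i.e. the new loop is cheaper per scalar iteration, without having to
   divide the (possibly non-constant) polynomial values. */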
2797 HOST_WIDE_INT est_rel_new_min
2798 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2799 HOST_WIDE_INT est_rel_new_max
2800 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2802 HOST_WIDE_INT est_rel_old_min
2803 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2804 HOST_WIDE_INT est_rel_old_max
2805 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2807 /* Check first if we can make out an unambiguous total order from the minimum
2808 and maximum estimates. */
2809 if (est_rel_new_min < est_rel_old_min
2810 && est_rel_new_max < est_rel_old_max)
2811 return true;
2812 else if (est_rel_old_min < est_rel_new_min
2813 && est_rel_old_max < est_rel_new_max)
2814 return false;
2815 /* When old_loop_vinfo uses a variable vectorization factor,
2816 we know that it has a lower cost for at least one runtime VF.
2817 However, we don't know how likely that VF is.
2819 One option would be to compare the costs for the estimated VFs.
2820 The problem is that that can put too much pressure on the cost
2821 model. E.g. if the estimated VF is also the lowest possible VF,
2822 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2823 for the estimated VF, we'd then choose new_loop_vinfo even
2824 though (a) new_loop_vinfo might not actually be better than
2825 old_loop_vinfo for that VF and (b) it would be significantly
2826 worse at larger VFs.
2828 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2829 no more expensive than old_loop_vinfo even after doubling the
2830 estimated old_loop_vinfo VF. For all but trivial loops, this
2831 ensures that we only pick new_loop_vinfo if it is significantly
2832 better than old_loop_vinfo at the estimated VF. */
2834 if (est_rel_old_min != est_rel_new_min
2835 || est_rel_old_max != est_rel_new_max)
2837 HOST_WIDE_INT est_rel_new_likely
2838 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2839 HOST_WIDE_INT est_rel_old_likely
2840 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2842 return est_rel_new_likely * 2 <= est_rel_old_likely;
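/* For example (hypothetical estimates): if the likely per-scalar-iteration
   cost estimate is 10 for new_loop_vinfo and 30 for old_loop_vinfo, then
   10 * 2 = 20 <= 30 holds and the new loop is preferred; with estimates of
   10 and 15 the test fails and we conservatively keep old_loop_vinfo. */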
2845 /* If there's nothing to choose between the loop bodies, see whether
2846 there's a difference in the prologue and epilogue costs. */
2847 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2848 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2850 return false;
2853 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2854 true if we should. */
2856 static bool
2857 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2858 loop_vec_info old_loop_vinfo)
2860 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2861 return false;
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE, vect_location,
2865 "***** Preferring vector mode %s to vector mode %s\n",
2866 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2867 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2868 return true;
2871 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2872 try to reanalyze it as a main loop. Return the loop_vinfo on success
2873 and null on failure. */
2875 static loop_vec_info
2876 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2878 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2879 return loop_vinfo;
2881 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_NOTE, vect_location,
2883 "***** Reanalyzing as a main loop with vector mode %s\n",
2884 GET_MODE_NAME (loop_vinfo->vector_mode));
2886 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2887 vec_info_shared *shared = loop_vinfo->shared;
2888 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2889 gcc_assert (main_loop_vinfo);
2891 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2893 bool fatal = false;
2894 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2895 loop->aux = NULL;
2896 if (!res)
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location,
2900 "***** Failed to analyze main loop with vector"
2901 " mode %s\n",
2902 GET_MODE_NAME (loop_vinfo->vector_mode));
2903 delete main_loop_vinfo;
2904 return NULL;
2906 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2907 return main_loop_vinfo;
2910 /* Function vect_analyze_loop.
2912 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2913 for it. The different analyses will record information in the
2914 loop_vec_info struct. */
2915 opt_loop_vec_info
2916 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2918 auto_vector_modes vector_modes;
2920 /* Autodetect first vector size we try. */
2921 unsigned int autovec_flags
2922 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2923 loop->simdlen != 0);
2924 unsigned int mode_i = 0;
2926 DUMP_VECT_SCOPE ("analyze_loop_nest");
2928 if (loop_outer (loop)
2929 && loop_vec_info_for_loop (loop_outer (loop))
2930 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2931 return opt_loop_vec_info::failure_at (vect_location,
2932 "outer-loop already vectorized.\n");
2934 if (!find_loop_nest (loop, &shared->loop_nest))
2935 return opt_loop_vec_info::failure_at
2936 (vect_location,
2937 "not vectorized: loop nest containing two or more consecutive inner"
2938 " loops cannot be vectorized\n");
2940 unsigned n_stmts = 0;
2941 machine_mode autodetected_vector_mode = VOIDmode;
2942 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2943 machine_mode next_vector_mode = VOIDmode;
2944 poly_uint64 lowest_th = 0;
2945 unsigned vectorized_loops = 0;
2946 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2947 && !unlimited_cost_model (loop));
2949 bool vect_epilogues = false;
2950 opt_result res = opt_result::success ();
2951 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2952 while (1)
2954 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2955 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2956 if (!loop_vinfo)
2958 if (dump_enabled_p ())
2959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2960 "bad loop form.\n");
2961 gcc_checking_assert (first_loop_vinfo == NULL);
2962 return loop_vinfo;
2964 loop_vinfo->vector_mode = next_vector_mode;
2966 bool fatal = false;
2968 /* When pick_lowest_cost_p is true, we should in principle iterate
2969 over all the loop_vec_infos that LOOP_VINFO could replace and
2970 try to vectorize LOOP_VINFO under the same conditions.
2971 E.g. when trying to replace an epilogue loop, we should vectorize
2972 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2973 to replace the main loop, we should vectorize LOOP_VINFO as a main
2974 loop too.
2976 However, autovectorize_vector_modes is usually sorted as follows:
2978 - Modes that naturally produce lower VFs usually follow modes that
2979 naturally produce higher VFs.
2981 - When modes naturally produce the same VF, maskable modes
2982 usually follow unmaskable ones, so that the maskable mode
2983 can be used to vectorize the epilogue of the unmaskable mode.
2985 This order is preferred because it leads to the maximum
2986 epilogue vectorization opportunities. Targets should only use
2987 a different order if they want to make wide modes available while
2988 disparaging them relative to earlier, smaller modes. The assumption
2989 in that case is that the wider modes are more expensive in some
2990 way that isn't reflected directly in the costs.
2992 There should therefore be few interesting cases in which
2993 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2994 treated as a standalone loop, and ends up being genuinely cheaper
2995 than FIRST_LOOP_VINFO. */
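/* For example (hypothetical target): with 32-bit elements, a 256-bit
   unmaskable mode (VF 8) would typically be listed before a 128-bit mode
   (VF 4), and a maskable mode with the same VF would follow its unmaskable
   counterpart so that it can be tried for the epilogue. */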
2996 if (vect_epilogues)
2997 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2999 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
3000 if (mode_i == 0)
3001 autodetected_vector_mode = loop_vinfo->vector_mode;
3002 if (dump_enabled_p ())
3004 if (res)
3005 dump_printf_loc (MSG_NOTE, vect_location,
3006 "***** Analysis succeeded with vector mode %s\n",
3007 GET_MODE_NAME (loop_vinfo->vector_mode));
3008 else
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 "***** Analysis failed with vector mode %s\n",
3011 GET_MODE_NAME (loop_vinfo->vector_mode));
3014 loop->aux = NULL;
3016 if (!fatal)
3017 while (mode_i < vector_modes.length ()
3018 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3020 if (dump_enabled_p ())
3021 dump_printf_loc (MSG_NOTE, vect_location,
3022 "***** The result for vector mode %s would"
3023 " be the same\n",
3024 GET_MODE_NAME (vector_modes[mode_i]));
3025 mode_i += 1;
3028 if (res)
3030 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3031 vectorized_loops++;
3033 /* Once we hit the desired simdlen for the first time,
3034 discard any previous attempts. */
3035 if (simdlen
3036 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038 delete first_loop_vinfo;
3039 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3040 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3041 simdlen = 0;
3043 else if (pick_lowest_cost_p && first_loop_vinfo)
3045 /* Keep trying to roll back vectorization attempts while the
3046 loop_vec_infos they produced were worse than this one. */
3047 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3048 while (!vinfos.is_empty ()
3049 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3051 gcc_assert (vect_epilogues);
3052 delete vinfos.pop ();
3054 if (vinfos.is_empty ()
3055 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3057 loop_vec_info main_loop_vinfo
3058 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3059 if (main_loop_vinfo == loop_vinfo)
3061 delete first_loop_vinfo;
3062 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3064 else if (main_loop_vinfo
3065 && vect_joust_loop_vinfos (main_loop_vinfo,
3066 first_loop_vinfo))
3068 delete first_loop_vinfo;
3069 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3070 delete loop_vinfo;
3071 loop_vinfo
3072 = opt_loop_vec_info::success (main_loop_vinfo);
3074 else
3076 if (dump_enabled_p ())
3077 dump_printf_loc (MSG_NOTE, vect_location,
3078 "***** No longer preferring vector"
3079 " mode %s after reanalyzing the loop"
3080 " as a main loop\n",
3081 GET_MODE_NAME
3082 (main_loop_vinfo->vector_mode));
3083 delete main_loop_vinfo;
3088 if (first_loop_vinfo == NULL)
3090 first_loop_vinfo = loop_vinfo;
3091 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3093 else if (vect_epilogues
3094 /* For now only allow one epilogue loop. */
3095 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3097 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3098 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3099 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3100 || maybe_ne (lowest_th, 0U));
3101 /* Keep track of the known smallest versioning
3102 threshold. */
3103 if (ordered_p (lowest_th, th))
3104 lowest_th = ordered_min (lowest_th, th);
3106 else
3108 delete loop_vinfo;
3109 loop_vinfo = opt_loop_vec_info::success (NULL);
3112 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3113 enabled, SIMDUID is not set, it is the innermost loop and we have
3114 either already found the loop's SIMDLEN or there was no SIMDLEN to
3115 begin with.
3116 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3117 vect_epilogues = (!simdlen
3118 && loop->inner == NULL
3119 && param_vect_epilogues_nomask
3120 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3121 && !loop->simduid
3122 /* For now only allow one epilogue loop, but allow
3123 pick_lowest_cost_p to replace it. */
3124 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3125 || pick_lowest_cost_p));
3127 /* Commit to first_loop_vinfo if we have no reason to try
3128 alternatives. */
3129 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3130 break;
3132 else
3134 delete loop_vinfo;
3135 loop_vinfo = opt_loop_vec_info::success (NULL);
3136 if (fatal)
3138 gcc_checking_assert (first_loop_vinfo == NULL);
3139 break;
3143 /* Handle the case where the original loop can use partial
3144 vectorization, but we only want to adopt it for the epilogue.
3145 The retry should be in the same vector mode as the original. */
3146 if (vect_epilogues
3147 && loop_vinfo
3148 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3150 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3151 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3152 if (dump_enabled_p ())
3153 dump_printf_loc (MSG_NOTE, vect_location,
3154 "***** Re-trying analysis with same vector mode"
3155 " %s for epilogue with partial vectors.\n",
3156 GET_MODE_NAME (loop_vinfo->vector_mode));
3157 continue;
3160 if (mode_i < vector_modes.length ()
3161 && VECTOR_MODE_P (autodetected_vector_mode)
3162 && (related_vector_mode (vector_modes[mode_i],
3163 GET_MODE_INNER (autodetected_vector_mode))
3164 == autodetected_vector_mode)
3165 && (related_vector_mode (autodetected_vector_mode,
3166 GET_MODE_INNER (vector_modes[mode_i]))
3167 == vector_modes[mode_i]))
3169 if (dump_enabled_p ())
3170 dump_printf_loc (MSG_NOTE, vect_location,
3171 "***** Skipping vector mode %s, which would"
3172 " repeat the analysis for %s\n",
3173 GET_MODE_NAME (vector_modes[mode_i]),
3174 GET_MODE_NAME (autodetected_vector_mode));
3175 mode_i += 1;
3178 if (mode_i == vector_modes.length ()
3179 || autodetected_vector_mode == VOIDmode)
3180 break;
3182 /* Try the next biggest vector size. */
3183 next_vector_mode = vector_modes[mode_i++];
3184 if (dump_enabled_p ())
3185 dump_printf_loc (MSG_NOTE, vect_location,
3186 "***** Re-trying analysis with vector mode %s\n",
3187 GET_MODE_NAME (next_vector_mode));
3190 if (first_loop_vinfo)
3192 loop->aux = (loop_vec_info) first_loop_vinfo;
3193 if (dump_enabled_p ())
3194 dump_printf_loc (MSG_NOTE, vect_location,
3195 "***** Choosing vector mode %s\n",
3196 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3197 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3198 return first_loop_vinfo;
3201 return opt_loop_vec_info::propagate_failure (res);
3204 /* Return true if there is an in-order reduction function for CODE, storing
3205 it in *REDUC_FN if so. */
3207 static bool
3208 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3210 switch (code)
3212 case PLUS_EXPR:
3213 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3214 return true;
3216 default:
3217 return false;
3221 /* Function reduction_fn_for_scalar_code
3223 Input:
3224 CODE - tree_code of a reduction operation.
3226 Output:
3227 REDUC_FN - the corresponding internal function to be used to reduce the
3228 vector of partial results into a single scalar result, or IFN_LAST
3229 if the operation is a supported reduction operation, but does not have
3230 such an internal function.
3232 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3234 bool
3235 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3237 switch (code)
3239 case MAX_EXPR:
3240 *reduc_fn = IFN_REDUC_MAX;
3241 return true;
3243 case MIN_EXPR:
3244 *reduc_fn = IFN_REDUC_MIN;
3245 return true;
3247 case PLUS_EXPR:
3248 *reduc_fn = IFN_REDUC_PLUS;
3249 return true;
3251 case BIT_AND_EXPR:
3252 *reduc_fn = IFN_REDUC_AND;
3253 return true;
3255 case BIT_IOR_EXPR:
3256 *reduc_fn = IFN_REDUC_IOR;
3257 return true;
3259 case BIT_XOR_EXPR:
3260 *reduc_fn = IFN_REDUC_XOR;
3261 return true;
3263 case MULT_EXPR:
3264 case MINUS_EXPR:
3265 *reduc_fn = IFN_LAST;
3266 return true;
3268 default:
3269 return false;
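/* For example (hypothetical reduction): a loop such as

     for (i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   is typically folded to a MAX_EXPR reduction, so the vector of partial
   maxima is collapsed to a single scalar with IFN_REDUC_MAX; for MULT_EXPR
   the function returns IFN_LAST and the reduction epilogue has to be
   open-coded instead. */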
3273 /* If there is a neutral value X such that a reduction would not be affected
3274 by the introduction of additional X elements, return that X, otherwise
3275 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3276 of the scalar elements. If the reduction has just a single initial value
3277 then INITIAL_VALUE is that value, otherwise it is null. */
3279 static tree
3280 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3282 switch (code)
3284 case WIDEN_SUM_EXPR:
3285 case DOT_PROD_EXPR:
3286 case SAD_EXPR:
3287 case PLUS_EXPR:
3288 case MINUS_EXPR:
3289 case BIT_IOR_EXPR:
3290 case BIT_XOR_EXPR:
3291 return build_zero_cst (scalar_type);
3293 case MULT_EXPR:
3294 return build_one_cst (scalar_type);
3296 case BIT_AND_EXPR:
3297 return build_all_ones_cst (scalar_type);
3299 case MAX_EXPR:
3300 case MIN_EXPR:
3301 return initial_value;
3303 default:
3304 return NULL_TREE;
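/* For example (hypothetical padding): when a sum reduction is widened to a
   vector that is only partially filled, the unused lanes can be seeded
   with the neutral value 0 without changing the result; a product uses 1,
   a bitwise AND uses all-ones, and MIN/MAX simply reuse the single initial
   value. */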
3308 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3309 STMT is printed with a message MSG. */
3311 static void
3312 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3314 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3317 /* Return true if we need an in-order reduction for operation CODE
3318 on type TYPE, i.e. if the scalar reduction operations must not be
3319 reassociated. */
3321 bool
3322 needs_fold_left_reduction_p (tree type, tree_code code)
3324 /* CHECKME: check for !flag_finite_math_only too? */
3325 if (SCALAR_FLOAT_TYPE_P (type))
3326 switch (code)
3328 case MIN_EXPR:
3329 case MAX_EXPR:
3330 return false;
3332 default:
3333 return !flag_associative_math;
3336 if (INTEGRAL_TYPE_P (type))
3338 if (!operation_no_trapping_overflow (type, code))
3339 return true;
3340 return false;
3343 if (SAT_FIXED_POINT_TYPE_P (type))
3344 return true;
3346 return false;
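/* For example (hypothetical reduction): a float accumulation

     for (i = 0; i < n; i++)
       sum += a[i];

   needs a fold-left (in-order) reduction unless -fassociative-math is in
   effect, because reassociating the additions can change the rounded
   result; likewise a signed integer reduction whose overflow traps
   (e.g. with -ftrapv) must keep the original evaluation order. */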
3349 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3350 has a handled computation expression. Store the main reduction
3351 operation in *CODE. */
3353 static bool
3354 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3355 tree loop_arg, enum tree_code *code,
3356 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3358 auto_bitmap visited;
3359 tree lookfor = PHI_RESULT (phi);
3360 ssa_op_iter curri;
3361 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3362 while (USE_FROM_PTR (curr) != loop_arg)
3363 curr = op_iter_next_use (&curri);
3364 curri.i = curri.numops;
3367 path.safe_push (std::make_pair (curri, curr));
3368 tree use = USE_FROM_PTR (curr);
3369 if (use == lookfor)
3370 break;
3371 gimple *def = SSA_NAME_DEF_STMT (use);
3372 if (gimple_nop_p (def)
3373 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3375 pop:
3378 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3379 curri = x.first;
3380 curr = x.second;
3382 curr = op_iter_next_use (&curri);
3383 /* Skip already visited or non-SSA operands (from iterating
3384 over PHI args). */
3385 while (curr != NULL_USE_OPERAND_P
3386 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3387 || ! bitmap_set_bit (visited,
3388 SSA_NAME_VERSION
3389 (USE_FROM_PTR (curr)))));
3391 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3392 if (curr == NULL_USE_OPERAND_P)
3393 break;
3395 else
3397 if (gimple_code (def) == GIMPLE_PHI)
3398 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3399 else
3400 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3401 while (curr != NULL_USE_OPERAND_P
3402 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3403 || ! bitmap_set_bit (visited,
3404 SSA_NAME_VERSION
3405 (USE_FROM_PTR (curr)))))
3406 curr = op_iter_next_use (&curri);
3407 if (curr == NULL_USE_OPERAND_P)
3408 goto pop;
3411 while (1);
3412 if (dump_file && (dump_flags & TDF_DETAILS))
3414 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3415 unsigned i;
3416 std::pair<ssa_op_iter, use_operand_p> *x;
3417 FOR_EACH_VEC_ELT (path, i, x)
3418 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3419 dump_printf (MSG_NOTE, "\n");
3422 /* Check whether the reduction path detected is valid. */
3423 bool fail = path.length () == 0;
3424 bool neg = false;
3425 int sign = -1;
3426 *code = ERROR_MARK;
3427 for (unsigned i = 1; i < path.length (); ++i)
3429 gimple *use_stmt = USE_STMT (path[i].second);
3430 tree op = USE_FROM_PTR (path[i].second);
3431 if (! is_gimple_assign (use_stmt)
3432 /* The following makes sure we can compute the operand index
3433 easily, plus it mostly disallows chaining via COND_EXPR condition
3434 operands. */
3435 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3436 && (gimple_num_ops (use_stmt) <= 2
3437 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3438 && (gimple_num_ops (use_stmt) <= 3
3439 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3441 fail = true;
3442 break;
3444 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3445 if (use_code == MINUS_EXPR)
3447 use_code = PLUS_EXPR;
3448 /* Track whether we negate the reduction value each iteration. */
3449 if (gimple_assign_rhs2 (use_stmt) == op)
3450 neg = ! neg;
3452 if (CONVERT_EXPR_CODE_P (use_code)
3453 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3454 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3456 else if (*code == ERROR_MARK)
3458 *code = use_code;
3459 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3461 else if (use_code != *code)
3463 fail = true;
3464 break;
3466 else if ((use_code == MIN_EXPR
3467 || use_code == MAX_EXPR)
3468 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3470 fail = true;
3471 break;
3473       /* Check that there is only a single stmt the op is used in.  For the
3474          non-value-changing tail and the last stmt allow out-of-loop uses.
3475 ??? We could relax this and handle arbitrary live stmts by
3476 forcing a scalar epilogue for example. */
3477 imm_use_iterator imm_iter;
3478 gimple *op_use_stmt;
3479 unsigned cnt = 0;
3480 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3481 if (!is_gimple_debug (op_use_stmt)
3482 && (*code != ERROR_MARK
3483 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3485 /* We want to allow x + x but not x < 1 ? x : 2. */
3486 if (is_gimple_assign (op_use_stmt)
3487 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3489 use_operand_p use_p;
3490 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3491 cnt++;
3493 else
3494 cnt++;
3496 if (cnt != 1)
3498 fail = true;
3499 break;
3502 return ! fail && ! neg && *code != ERROR_MARK;
3505 bool
3506 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3507 tree loop_arg, enum tree_code code)
3509 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3510 enum tree_code code_;
3511 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3512 && code_ == code);
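
/* For illustration (source-level sketches with assumed variables): the path
   check accepts cycles whose statements all use one operation code, with
   no-op conversions allowed at either end, e.g.

     for (i = 0; i < n; ++i)
       s = s + a[i];              single PLUS_EXPR step
     for (i = 0; i < n; ++i)
       s = (s + a[i]) + b[i];     two PLUS_EXPR steps, a chain

   but rejects cycles that mix operation codes or negate the running value:

     for (i = 0; i < n; ++i)
       s = (s + a[i]) * b[i];     PLUS_EXPR then MULT_EXPR: code mismatch
     for (i = 0; i < n; ++i)
       s = a[i] - s;              s is the subtrahend, so NEG toggles  */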
3517 /* Function vect_is_simple_reduction
3519 (1) Detect a cross-iteration def-use cycle that represents a simple
3520 reduction computation. We look for the following pattern:
3522 loop_header:
3523 a1 = phi < a0, a2 >
3524 a3 = ...
3525 a2 = operation (a3, a1)
3527    or
3529    a3 = ...
3530 loop_header:
3531 a1 = phi < a0, a2 >
3532 a2 = operation (a3, a1)
3534 such that:
3535 1. operation is commutative and associative and it is safe to
3536 change the order of the computation
3537 2. no uses for a2 in the loop (a2 is used out of the loop)
3538 3. no uses of a1 in the loop besides the reduction operation
3539 4. no uses of a1 outside the loop.
3541 Conditions 1,4 are tested here.
3542 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3544 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3545 nested cycles.
3547 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3548 reductions:
3550 a1 = phi < a0, a2 >
3551 inner loop (def of a3)
3552 a2 = phi < a3 >
3554    (4) Detect condition expressions, i.e.:
3555      for (int i = 0; i < N; i++)
3556        if (a[i] < val)
3557          ret_val = a[i];
3559 */
3561 static stmt_vec_info
3562 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3563 bool *double_reduc, bool *reduc_chain_p)
3565 gphi *phi = as_a <gphi *> (phi_info->stmt);
3566 gimple *phi_use_stmt = NULL;
3567 imm_use_iterator imm_iter;
3568 use_operand_p use_p;
3570 *double_reduc = false;
3571 *reduc_chain_p = false;
3572 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3574 tree phi_name = PHI_RESULT (phi);
3575 /* ??? If there are no uses of the PHI result the inner loop reduction
3576 won't be detected as possibly double-reduction by vectorizable_reduction
3577 because that tries to walk the PHI arg from the preheader edge which
3578 can be constant. See PR60382. */
3579 if (has_zero_uses (phi_name))
3580 return NULL;
3581 class loop *loop = (gimple_bb (phi))->loop_father;
3582 unsigned nphi_def_loop_uses = 0;
3583 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3585 gimple *use_stmt = USE_STMT (use_p);
3586 if (is_gimple_debug (use_stmt))
3587 continue;
3589 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3591 if (dump_enabled_p ())
3592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3593 "intermediate value used outside loop.\n");
3595 return NULL;
3598 nphi_def_loop_uses++;
3599 phi_use_stmt = use_stmt;
3602 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3603 if (TREE_CODE (latch_def) != SSA_NAME)
3605 if (dump_enabled_p ())
3606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3607 "reduction: not ssa_name: %T\n", latch_def);
3608 return NULL;
3611 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3612 if (!def_stmt_info
3613 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3614 return NULL;
3616 bool nested_in_vect_loop
3617 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3618 unsigned nlatch_def_loop_uses = 0;
3619 auto_vec<gphi *, 3> lcphis;
3620 bool inner_loop_of_double_reduc = false;
3621 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3623 gimple *use_stmt = USE_STMT (use_p);
3624 if (is_gimple_debug (use_stmt))
3625 continue;
3626 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3627 nlatch_def_loop_uses++;
3628 else
3630 /* We can have more than one loop-closed PHI. */
3631 lcphis.safe_push (as_a <gphi *> (use_stmt));
3632 if (nested_in_vect_loop
3633 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3634 == vect_double_reduction_def))
3635 inner_loop_of_double_reduc = true;
3639   /* If we are vectorizing an inner reduction then we execute it in the
3640      original order only when we are not dealing with a double
3641      reduction.  */
3642 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3644 if (dump_enabled_p ())
3645 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3646 "detected nested cycle: ");
3647 return def_stmt_info;
3650 /* If this isn't a nested cycle or if the nested cycle reduction value
3651      is used outside of the inner loop we cannot handle uses of the reduction
3652 value. */
3653 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3655 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3657 "reduction used in loop.\n");
3658 return NULL;
3661 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3662 defined in the inner loop. */
3663 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3665 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3666 if (gimple_phi_num_args (def_stmt) != 1
3667 || TREE_CODE (op1) != SSA_NAME)
3669 if (dump_enabled_p ())
3670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3671 "unsupported phi node definition.\n");
3673 return NULL;
3676 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3677 if (gimple_bb (def1)
3678 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3679 && loop->inner
3680 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3681 && is_gimple_assign (def1)
3682 && is_a <gphi *> (phi_use_stmt)
3683 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3685 if (dump_enabled_p ())
3686 report_vect_op (MSG_NOTE, def_stmt,
3687 "detected double reduction: ");
3689 *double_reduc = true;
3690 return def_stmt_info;
3693 return NULL;
3696   /* Look for the expression computing latch_def from the loop PHI result.  */
3697 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3698 enum tree_code code;
3699 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3700 path))
3702 STMT_VINFO_REDUC_CODE (phi_info) = code;
3703 if (code == COND_EXPR && !nested_in_vect_loop)
3704 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3706 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3707 reduction chain for which the additional restriction is that
3708 all operations in the chain are the same. */
3709 auto_vec<stmt_vec_info, 8> reduc_chain;
3710 unsigned i;
3711 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3712 for (i = path.length () - 1; i >= 1; --i)
3714 gimple *stmt = USE_STMT (path[i].second);
3715 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3716 STMT_VINFO_REDUC_IDX (stmt_info)
3717 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3718 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3719 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3720 && (i == 1 || i == path.length () - 1));
3721 if ((stmt_code != code && !leading_conversion)
3722 /* We can only handle the final value in epilogue
3723 generation for reduction chains. */
3724 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3725 is_slp_reduc = false;
3726          /* For reduction chains we support trailing/leading
3727             conversions.  We do not store those in the actual chain.  */
3728 if (leading_conversion)
3729 continue;
3730 reduc_chain.safe_push (stmt_info);
3732 if (is_slp_reduc && reduc_chain.length () > 1)
3734 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3736 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3737 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3739 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3740 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3742 /* Save the chain for further analysis in SLP detection. */
3743 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3744 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3746 *reduc_chain_p = true;
3747 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_NOTE, vect_location,
3749 "reduction: detected reduction chain\n");
3751 else if (dump_enabled_p ())
3752 dump_printf_loc (MSG_NOTE, vect_location,
3753 "reduction: detected reduction\n");
3755 return def_stmt_info;
3758 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_NOTE, vect_location,
3760 "reduction: unknown pattern\n");
3762 return NULL;
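
/* For illustration (an assumed source fragment): a reduction chain, for
   which *REDUC_CHAIN_P is set, applies the same operation several times to
   one accumulator within a single iteration, e.g.

     for (i = 0; i < n; ++i)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   whereas an SLP reduction without a chain uses independent accumulators
   that are only combined after the loop.  */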
3765 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3766 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3767 or -1 if not known. */
3769 static int
3770 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3772 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3773 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3775 if (dump_enabled_p ())
3776 dump_printf_loc (MSG_NOTE, vect_location,
3777 "cost model: epilogue peel iters set to vf/2 "
3778 "because loop iterations are unknown .\n");
3779 return assumed_vf / 2;
3781 else
3783 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3784 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3785 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3786 /* If we need to peel for gaps, but no peeling is required, we have to
3787 peel VF iterations. */
3788 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3789 peel_iters_epilogue = assumed_vf;
3790 return peel_iters_epilogue;
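
/* Worked example (numbers assumed): with NITERS = 100, an assumed VF of 8
   and 3 prologue iterations peeled for alignment, the epilogue peels
   (100 - 3) % 8 = 1 iteration.  If PEELING_FOR_GAPS were set and the
   remainder were 0, a full VF (here 8) iterations would be peeled
   instead.  */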
3794 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3795 int
3796 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3797 int *peel_iters_epilogue,
3798 stmt_vector_for_cost *scalar_cost_vec,
3799 stmt_vector_for_cost *prologue_cost_vec,
3800 stmt_vector_for_cost *epilogue_cost_vec)
3802 int retval = 0;
3804 *peel_iters_epilogue
3805 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3807 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3809     /* If peeled iterations are known but the number of scalar loop
3810        iterations is unknown, count a taken branch per peeled loop.  */
3811 if (peel_iters_prologue > 0)
3812 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3813 NULL, NULL_TREE, 0, vect_prologue);
3814 if (*peel_iters_epilogue > 0)
3815 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3816 NULL, NULL_TREE, 0, vect_epilogue);
3819 stmt_info_for_cost *si;
3820 int j;
3821 if (peel_iters_prologue)
3822 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3823 retval += record_stmt_cost (prologue_cost_vec,
3824 si->count * peel_iters_prologue,
3825 si->kind, si->stmt_info, si->misalign,
3826 vect_prologue);
3827 if (*peel_iters_epilogue)
3828 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3829 retval += record_stmt_cost (epilogue_cost_vec,
3830 si->count * *peel_iters_epilogue,
3831 si->kind, si->stmt_info, si->misalign,
3832 vect_epilogue);
3834 return retval;
3837 /* Function vect_estimate_min_profitable_iters
3839 Return the number of iterations required for the vector version of the
3840 loop to be profitable relative to the cost of the scalar version of the
3841 loop.
3843 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3844 of iterations for vectorization. -1 value means loop vectorization
3845 is not profitable. This returned value may be used for dynamic
3846 profitability check.
3848 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3849 for static check against estimated number of iterations. */
3851 static void
3852 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3853 int *ret_min_profitable_niters,
3854 int *ret_min_profitable_estimate)
3856 int min_profitable_iters;
3857 int min_profitable_estimate;
3858 int peel_iters_prologue;
3859 int peel_iters_epilogue;
3860 unsigned vec_inside_cost = 0;
3861 int vec_outside_cost = 0;
3862 unsigned vec_prologue_cost = 0;
3863 unsigned vec_epilogue_cost = 0;
3864 int scalar_single_iter_cost = 0;
3865 int scalar_outside_cost = 0;
3866 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3867 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3868 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3870 /* Cost model disabled. */
3871 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3873 if (dump_enabled_p ())
3874 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3875 *ret_min_profitable_niters = 0;
3876 *ret_min_profitable_estimate = 0;
3877 return;
3880 /* Requires loop versioning tests to handle misalignment. */
3881 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3883 /* FIXME: Make cost depend on complexity of individual check. */
3884 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3885 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3886 NULL, NULL_TREE, 0, vect_prologue);
3887 if (dump_enabled_p ())
3888 dump_printf (MSG_NOTE,
3889 "cost model: Adding cost of checks for loop "
3890 "versioning to treat misalignment.\n");
3893 /* Requires loop versioning with alias checks. */
3894 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3896 /* FIXME: Make cost depend on complexity of individual check. */
3897 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3898 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3899 NULL, NULL_TREE, 0, vect_prologue);
3900 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3901 if (len)
3902 /* Count LEN - 1 ANDs and LEN comparisons. */
3903 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3904 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3905 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3906 if (len)
3908 /* Count LEN - 1 ANDs and LEN comparisons. */
3909 unsigned int nstmts = len * 2 - 1;
3910 /* +1 for each bias that needs adding. */
3911 for (unsigned int i = 0; i < len; ++i)
3912 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3913 nstmts += 1;
3914 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3915 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3917 if (dump_enabled_p ())
3918 dump_printf (MSG_NOTE,
3919 "cost model: Adding cost of checks for loop "
3920 "versioning aliasing.\n");
3923 /* Requires loop versioning with niter checks. */
3924 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3926 /* FIXME: Make cost depend on complexity of individual check. */
3927 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3928 NULL, NULL_TREE, 0, vect_prologue);
3929 if (dump_enabled_p ())
3930 dump_printf (MSG_NOTE,
3931 "cost model: Adding cost of checks for loop "
3932 "versioning niters.\n");
3935 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3936 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3937 NULL, NULL_TREE, 0, vect_prologue);
3939 /* Count statements in scalar loop. Using this as scalar cost for a single
3940 iteration for now.
3942 TODO: Add outer loop support.
3944 TODO: Consider assigning different costs to different scalar
3945 statements. */
3947 scalar_single_iter_cost
3948 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3950 /* Add additional cost for the peeled instructions in prologue and epilogue
3951 loop. (For fully-masked loops there will be no peeling.)
3953 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3954     at compile time, we assume it's vf/2 (the worst would be vf-1).
3956 TODO: Build an expression that represents peel_iters for prologue and
3957 epilogue to be used in a run-time test. */
3959 bool prologue_need_br_taken_cost = false;
3960 bool prologue_need_br_not_taken_cost = false;
3962 /* Calculate peel_iters_prologue. */
3963 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3964 peel_iters_prologue = 0;
3965 else if (npeel < 0)
3967 peel_iters_prologue = assumed_vf / 2;
3968 if (dump_enabled_p ())
3969 dump_printf (MSG_NOTE, "cost model: "
3970 "prologue peel iters set to vf/2.\n");
3972 /* If peeled iterations are unknown, count a taken branch and a not taken
3973 branch per peeled loop. Even if scalar loop iterations are known,
3974 vector iterations are not known since peeled prologue iterations are
3975 not known. Hence guards remain the same. */
3976 prologue_need_br_taken_cost = true;
3977 prologue_need_br_not_taken_cost = true;
3979 else
3981 peel_iters_prologue = npeel;
3982 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3983        /* If peeled iterations are known but the number of scalar loop
3984           iterations is unknown, count a taken branch per peeled loop.  */
3985 prologue_need_br_taken_cost = true;
3988 bool epilogue_need_br_taken_cost = false;
3989 bool epilogue_need_br_not_taken_cost = false;
3991 /* Calculate peel_iters_epilogue. */
3992 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3993 /* We need to peel exactly one iteration for gaps. */
3994 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3995 else if (npeel < 0)
3997       /* If peeling for alignment is unknown, the loop bound of the main
3998          loop becomes unknown.  */
3999 peel_iters_epilogue = assumed_vf / 2;
4000 if (dump_enabled_p ())
4001 dump_printf (MSG_NOTE, "cost model: "
4002 "epilogue peel iters set to vf/2 because "
4003 "peeling for alignment is unknown.\n");
4005 /* See the same reason above in peel_iters_prologue calculation. */
4006 epilogue_need_br_taken_cost = true;
4007 epilogue_need_br_not_taken_cost = true;
4009 else
4011 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4012 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4013        /* If peeled iterations are known but the number of scalar loop
4014           iterations is unknown, count a taken branch per peeled loop.  */
4015 epilogue_need_br_taken_cost = true;
4018 stmt_info_for_cost *si;
4019 int j;
4020 /* Add costs associated with peel_iters_prologue. */
4021 if (peel_iters_prologue)
4022 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4024 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4025 si->count * peel_iters_prologue, si->kind,
4026 si->stmt_info, si->vectype, si->misalign,
4027 vect_prologue);
4030 /* Add costs associated with peel_iters_epilogue. */
4031 if (peel_iters_epilogue)
4032 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4034 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4035 si->count * peel_iters_epilogue, si->kind,
4036 si->stmt_info, si->vectype, si->misalign,
4037 vect_epilogue);
4040 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4042 if (prologue_need_br_taken_cost)
4043 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4044 NULL, NULL_TREE, 0, vect_prologue);
4046 if (prologue_need_br_not_taken_cost)
4047 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4048 cond_branch_not_taken, NULL, NULL_TREE, 0,
4049 vect_prologue);
4051 if (epilogue_need_br_taken_cost)
4052 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4053 NULL, NULL_TREE, 0, vect_epilogue);
4055 if (epilogue_need_br_not_taken_cost)
4056 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4057 cond_branch_not_taken, NULL, NULL_TREE, 0,
4058 vect_epilogue);
4060 /* Take care of special costs for rgroup controls of partial vectors. */
4061 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4063 /* Calculate how many masks we need to generate. */
4064 unsigned int num_masks = 0;
4065 rgroup_controls *rgm;
4066 unsigned int num_vectors_m1;
4067 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4068 if (rgm->type)
4069 num_masks += num_vectors_m1 + 1;
4070 gcc_assert (num_masks > 0);
4072 /* In the worst case, we need to generate each mask in the prologue
4073 and in the loop body. One of the loop body mask instructions
4074 replaces the comparison in the scalar loop, and since we don't
4075 count the scalar comparison against the scalar body, we shouldn't
4076 count that vector instruction against the vector body either.
4078 Sometimes we can use unpacks instead of generating prologue
4079 masks and sometimes the prologue mask will fold to a constant,
4080 so the actual prologue cost might be smaller. However, it's
4081 simpler and safer to use the worst-case cost; if this ends up
4082 being the tie-breaker between vectorizing or not, then it's
4083 probably better not to vectorize. */
4084 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4085 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4086 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4087 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4089 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4091 /* Referring to the functions vect_set_loop_condition_partial_vectors
4092 and vect_set_loop_controls_directly, we need to generate each
4093 length in the prologue and in the loop body if required. Although
4094 there are some possible optimizations, we consider the worst case
4095 here. */
4097 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4098 bool need_iterate_p
4099 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4100 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4102 /* Calculate how many statements to be added. */
4103 unsigned int prologue_stmts = 0;
4104 unsigned int body_stmts = 0;
4106 rgroup_controls *rgc;
4107 unsigned int num_vectors_m1;
4108 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4109 if (rgc->type)
4111 /* May need one SHIFT for nitems_total computation. */
4112 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4113 if (nitems != 1 && !niters_known_p)
4114 prologue_stmts += 1;
4116 /* May need one MAX and one MINUS for wrap around. */
4117 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4118 prologue_stmts += 2;
4120            /* Need one MAX and one MINUS for each batch limit except for
4121               the first one.  */
4122 prologue_stmts += num_vectors_m1 * 2;
4124 unsigned int num_vectors = num_vectors_m1 + 1;
4126 /* Need to set up lengths in prologue, only one MIN required
4127 for each since start index is zero. */
4128 prologue_stmts += num_vectors;
4130 /* Each may need two MINs and one MINUS to update lengths in body
4131 for next iteration. */
4132 if (need_iterate_p)
4133 body_stmts += 3 * num_vectors;
4136 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4137 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4138 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4139 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4142 /* FORNOW: The scalar outside cost is incremented in one of the
4143 following ways:
4145 1. The vectorizer checks for alignment and aliasing and generates
4146 a condition that allows dynamic vectorization. A cost model
4147 check is ANDED with the versioning condition. Hence scalar code
4148 path now has the added cost of the versioning check.
4150 if (cost > th & versioning_check)
4151 jmp to vector code
4153 Hence run-time scalar is incremented by not-taken branch cost.
4155 2. The vectorizer then checks if a prologue is required. If the
4156 cost model check was not done before during versioning, it has to
4157 be done before the prologue check.
4159 if (cost <= th)
4160 prologue = scalar_iters
4161 if (prologue == 0)
4162 jmp to vector code
4163 else
4164 execute prologue
4165 if (prologue == num_iters)
4166 go to exit
4168 Hence the run-time scalar cost is incremented by a taken branch,
4169 plus a not-taken branch, plus a taken branch cost.
4171 3. The vectorizer then checks if an epilogue is required. If the
4172 cost model check was not done before during prologue check, it
4173 has to be done with the epilogue check.
4175 if (prologue == 0)
4176 jmp to vector code
4177 else
4178 execute prologue
4179 if (prologue == num_iters)
4180 go to exit
4181 vector code:
4182 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4183 jmp to epilogue
4185 Hence the run-time scalar cost should be incremented by 2 taken
4186 branches.
4188     TODO: The back end may reorder the BBs differently and reverse
4189 conditions/branch directions. Change the estimates below to
4190 something more reasonable. */
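
/* Worked example (branch costs assumed; they are target-specific): with
   cond_branch_taken = 3 and cond_branch_not_taken = 1, the code below
   charges the scalar path 1 when the cost check is folded into the loop
   versioning test, 2*3 + 1 = 7 when it is emitted together with a prologue
   check for unknown peeling, and 2*3 = 6 when it is emitted with the
   epilogue check.  */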
4192 /* If the number of iterations is known and we do not do versioning, we can
4193 decide whether to vectorize at compile time. Hence the scalar version
4194     does not carry cost model guard costs.  */
4195 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4196 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4198 /* Cost model check occurs at versioning. */
4199 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4200 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4201 else
4203 /* Cost model check occurs at prologue generation. */
4204 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4205 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4206 + vect_get_stmt_cost (cond_branch_not_taken);
4207 /* Cost model check occurs at epilogue generation. */
4208 else
4209 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4213 /* Complete the target-specific cost calculations. */
4214 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4215 &vec_inside_cost, &vec_epilogue_cost);
4217 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4219 /* Stash the costs so that we can compare two loop_vec_infos. */
4220 loop_vinfo->vec_inside_cost = vec_inside_cost;
4221 loop_vinfo->vec_outside_cost = vec_outside_cost;
4223 if (dump_enabled_p ())
4225 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4226 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4227 vec_inside_cost);
4228 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4229 vec_prologue_cost);
4230 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4231 vec_epilogue_cost);
4232 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4233 scalar_single_iter_cost);
4234 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4235 scalar_outside_cost);
4236 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4237 vec_outside_cost);
4238 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4239 peel_iters_prologue);
4240 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4241 peel_iters_epilogue);
4244 /* Calculate number of iterations required to make the vector version
4245 profitable, relative to the loop bodies only. The following condition
4246 must hold true:
4247 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4248 where
4249 SIC = scalar iteration cost, VIC = vector iteration cost,
4250 VOC = vector outside cost, VF = vectorization factor,
4251 NPEEL = prologue iterations + epilogue iterations,
4252 SOC = scalar outside cost for run time cost model check. */
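
/* Worked example (all costs assumed, NPEEL = 0): with SIC = 4, VIC = 12,
   VF = 4, VOC = 30 and SOC = 6, each vector iteration saves
   SIC * VF - VIC = 4 units and the two sides meet at niters = 24, where
   SIC * 24 + SOC = 102 equals VIC * (24 / VF) + VOC = 72 + 30 = 102;
   so at least 25 scalar iterations are needed for the vector variant
   to win.  */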
4254 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4255 - vec_inside_cost);
4256 if (saving_per_viter <= 0)
4258 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4259 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4260 "vectorization did not happen for a simd loop");
4262 if (dump_enabled_p ())
4263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4264 "cost model: the vector iteration cost = %d "
4265 "divided by the scalar iteration cost = %d "
4266 "is greater or equal to the vectorization factor = %d"
4267 ".\n",
4268 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4269 *ret_min_profitable_niters = -1;
4270 *ret_min_profitable_estimate = -1;
4271 return;
4274 /* ??? The "if" arm is written to handle all cases; see below for what
4275 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4276 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4278 /* Rewriting the condition above in terms of the number of
4279 vector iterations (vniters) rather than the number of
4280 scalar iterations (niters) gives:
4282 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4284 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4286 For integer N, X and Y when X > 0:
4288 N * X > Y <==> N >= (Y /[floor] X) + 1. */
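
      /* E.g. (numbers assumed) Y = 10, X = 4: N * 4 > 10 first holds at
         N = 10 / 4 + 1 = 3, since 2 * 4 = 8 <= 10 but 3 * 4 = 12 > 10.  */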
4289 int outside_overhead = (vec_outside_cost
4290 - scalar_single_iter_cost * peel_iters_prologue
4291 - scalar_single_iter_cost * peel_iters_epilogue
4292 - scalar_outside_cost);
4293 /* We're only interested in cases that require at least one
4294 vector iteration. */
4295 int min_vec_niters = 1;
4296 if (outside_overhead > 0)
4297 min_vec_niters = outside_overhead / saving_per_viter + 1;
4299 if (dump_enabled_p ())
4300 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4301 min_vec_niters);
4303 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4305 /* Now that we know the minimum number of vector iterations,
4306 find the minimum niters for which the scalar cost is larger:
4308 SIC * niters > VIC * vniters + VOC - SOC
4310 We know that the minimum niters is no more than
4311 vniters * VF + NPEEL, but it might be (and often is) less
4312 than that if a partial vector iteration is cheaper than the
4313 equivalent scalar code. */
4314 int threshold = (vec_inside_cost * min_vec_niters
4315 + vec_outside_cost
4316 - scalar_outside_cost);
4317 if (threshold <= 0)
4318 min_profitable_iters = 1;
4319 else
4320 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4322 else
4323 /* Convert the number of vector iterations into a number of
4324 scalar iterations. */
4325 min_profitable_iters = (min_vec_niters * assumed_vf
4326 + peel_iters_prologue
4327 + peel_iters_epilogue);
4329 else
4331 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4332 * assumed_vf
4333 - vec_inside_cost * peel_iters_prologue
4334 - vec_inside_cost * peel_iters_epilogue);
4335 if (min_profitable_iters <= 0)
4336 min_profitable_iters = 0;
4337 else
4339 min_profitable_iters /= saving_per_viter;
4341 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4342 <= (((int) vec_inside_cost * min_profitable_iters)
4343 + (((int) vec_outside_cost - scalar_outside_cost)
4344 * assumed_vf)))
4345 min_profitable_iters++;
4349 if (dump_enabled_p ())
4350 dump_printf (MSG_NOTE,
4351 " Calculated minimum iters for profitability: %d\n",
4352 min_profitable_iters);
4354 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4355 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4356 /* We want the vectorized loop to execute at least once. */
4357 min_profitable_iters = assumed_vf + peel_iters_prologue;
4358 else if (min_profitable_iters < peel_iters_prologue)
4359 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4360 vectorized loop executes at least once. */
4361 min_profitable_iters = peel_iters_prologue;
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_NOTE, vect_location,
4365 " Runtime profitability threshold = %d\n",
4366 min_profitable_iters);
4368 *ret_min_profitable_niters = min_profitable_iters;
4370 /* Calculate number of iterations required to make the vector version
4371 profitable, relative to the loop bodies only.
4373     The non-vectorized variant costs SIC * niters and must win over the vector
4374     variant on the expected loop trip count.  The following condition must hold:
4375 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4377 if (vec_outside_cost <= 0)
4378 min_profitable_estimate = 0;
4379 /* ??? This "else if" arm is written to handle all cases; see below for
4380 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4381 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4383 /* This is a repeat of the code above, but with + SOC rather
4384 than - SOC. */
4385 int outside_overhead = (vec_outside_cost
4386 - scalar_single_iter_cost * peel_iters_prologue
4387 - scalar_single_iter_cost * peel_iters_epilogue
4388 + scalar_outside_cost);
4389 int min_vec_niters = 1;
4390 if (outside_overhead > 0)
4391 min_vec_niters = outside_overhead / saving_per_viter + 1;
4393 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4395 int threshold = (vec_inside_cost * min_vec_niters
4396 + vec_outside_cost
4397 + scalar_outside_cost);
4398 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4400 else
4401 min_profitable_estimate = (min_vec_niters * assumed_vf
4402 + peel_iters_prologue
4403 + peel_iters_epilogue);
4405 else
4407 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4408 * assumed_vf
4409 - vec_inside_cost * peel_iters_prologue
4410 - vec_inside_cost * peel_iters_epilogue)
4411 / ((scalar_single_iter_cost * assumed_vf)
4412 - vec_inside_cost);
4414 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4415 if (dump_enabled_p ())
4416 dump_printf_loc (MSG_NOTE, vect_location,
4417 " Static estimate profitability threshold = %d\n",
4418 min_profitable_estimate);
4420 *ret_min_profitable_estimate = min_profitable_estimate;
4423 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4424 vector elements (not bits) for a vector with NELT elements. */
4425 static void
4426 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4427 vec_perm_builder *sel)
4429 /* The encoding is a single stepped pattern. Any wrap-around is handled
4430 by vec_perm_indices. */
4431 sel->new_vector (nelt, 1, 3);
4432 for (unsigned int i = 0; i < 3; i++)
4433 sel->quick_push (i + offset);
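
/* Example (NELT and OFFSET assumed): for OFFSET = 2 and NELT = 8 the
   stepped encoding pushes { 2, 3, 4 }, which vec_perm_indices expands to
   { 2, 3, 4, 5, 6, 7, 8, 9 }, i.e. a whole-vector shift down by two
   elements; indices 8 and 9 refer to the second vector operand of the
   permutation.  */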
4436 /* Checks whether the target supports whole-vector shifts for vectors of mode
4437 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4438 it supports vec_perm_const with masks for all necessary shift amounts. */
4439 static bool
4440 have_whole_vector_shift (machine_mode mode)
4442 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4443 return true;
4445 /* Variable-length vectors should be handled via the optab. */
4446 unsigned int nelt;
4447 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4448 return false;
4450 vec_perm_builder sel;
4451 vec_perm_indices indices;
4452 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4454 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4455 indices.new_vector (sel, 2, nelt);
4456 if (!can_vec_perm_const_p (mode, indices, false))
4457 return false;
4459 return true;
4462 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4463 functions. Design better to avoid maintenance issues. */
4465 /* Function vect_model_reduction_cost.
4467 Models cost for a reduction operation, including the vector ops
4468 generated within the strip-mine loop in some cases, the initial
4469 definition before the loop, and the epilogue code that must be generated. */
4471 static void
4472 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4473 stmt_vec_info stmt_info, internal_fn reduc_fn,
4474 vect_reduction_type reduction_type,
4475 int ncopies, stmt_vector_for_cost *cost_vec)
4477 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4478 enum tree_code code;
4479 optab optab;
4480 tree vectype;
4481 machine_mode mode;
4482 class loop *loop = NULL;
4484 if (loop_vinfo)
4485 loop = LOOP_VINFO_LOOP (loop_vinfo);
4487 /* Condition reductions generate two reductions in the loop. */
4488 if (reduction_type == COND_REDUCTION)
4489 ncopies *= 2;
4491 vectype = STMT_VINFO_VECTYPE (stmt_info);
4492 mode = TYPE_MODE (vectype);
4493 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4495 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4497 if (reduction_type == EXTRACT_LAST_REDUCTION)
4498 /* No extra instructions are needed in the prologue. The loop body
4499 operations are costed in vectorizable_condition. */
4500 inside_cost = 0;
4501 else if (reduction_type == FOLD_LEFT_REDUCTION)
4503 /* No extra instructions needed in the prologue. */
4504 prologue_cost = 0;
4506 if (reduc_fn != IFN_LAST)
4507 /* Count one reduction-like operation per vector. */
4508 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4509 stmt_info, 0, vect_body);
4510 else
4512 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4513 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4514 inside_cost = record_stmt_cost (cost_vec, nelements,
4515 vec_to_scalar, stmt_info, 0,
4516 vect_body);
4517 inside_cost += record_stmt_cost (cost_vec, nelements,
4518 scalar_stmt, stmt_info, 0,
4519 vect_body);
4522 else
4524 /* Add in cost for initial definition.
4525 For cond reduction we have four vectors: initial index, step,
4526 initial result of the data reduction, initial value of the index
4527 reduction. */
4528 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4529 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4530 scalar_to_vec, stmt_info, 0,
4531 vect_prologue);
4534 /* Determine cost of epilogue code.
4536 We have a reduction operator that will reduce the vector in one statement.
4537 Also requires scalar extract. */
4539 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4541 if (reduc_fn != IFN_LAST)
4543 if (reduction_type == COND_REDUCTION)
4545              /* An EQ stmt and a COND_EXPR stmt.  */
4546 epilogue_cost += record_stmt_cost (cost_vec, 2,
4547 vector_stmt, stmt_info, 0,
4548 vect_epilogue);
4549 /* Reduction of the max index and a reduction of the found
4550 values. */
4551 epilogue_cost += record_stmt_cost (cost_vec, 2,
4552 vec_to_scalar, stmt_info, 0,
4553 vect_epilogue);
4554 /* A broadcast of the max value. */
4555 epilogue_cost += record_stmt_cost (cost_vec, 1,
4556 scalar_to_vec, stmt_info, 0,
4557 vect_epilogue);
4559 else
4561 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4562 stmt_info, 0, vect_epilogue);
4563 epilogue_cost += record_stmt_cost (cost_vec, 1,
4564 vec_to_scalar, stmt_info, 0,
4565 vect_epilogue);
4568 else if (reduction_type == COND_REDUCTION)
4570 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4571 /* Extraction of scalar elements. */
4572 epilogue_cost += record_stmt_cost (cost_vec,
4573 2 * estimated_nunits,
4574 vec_to_scalar, stmt_info, 0,
4575 vect_epilogue);
4576 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4577 epilogue_cost += record_stmt_cost (cost_vec,
4578 2 * estimated_nunits - 3,
4579 scalar_stmt, stmt_info, 0,
4580 vect_epilogue);
4582 else if (reduction_type == EXTRACT_LAST_REDUCTION
4583 || reduction_type == FOLD_LEFT_REDUCTION)
4584    /* No extra instructions are needed in the epilogue.  */
4586 else
4588 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4589 tree bitsize =
4590 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4591 int element_bitsize = tree_to_uhwi (bitsize);
4592 int nelements = vec_size_in_bits / element_bitsize;
4594 if (code == COND_EXPR)
4595 code = MAX_EXPR;
4597 optab = optab_for_tree_code (code, vectype, optab_default);
4599 /* We have a whole vector shift available. */
4600 if (optab != unknown_optab
4601 && VECTOR_MODE_P (mode)
4602 && optab_handler (optab, mode) != CODE_FOR_nothing
4603 && have_whole_vector_shift (mode))
4605 /* Final reduction via vector shifts and the reduction operator.
4606 Also requires scalar extract. */
4607 epilogue_cost += record_stmt_cost (cost_vec,
4608 exact_log2 (nelements) * 2,
4609 vector_stmt, stmt_info, 0,
4610 vect_epilogue);
4611 epilogue_cost += record_stmt_cost (cost_vec, 1,
4612 vec_to_scalar, stmt_info, 0,
4613 vect_epilogue);
4615 else
4616 /* Use extracts and reduction op for final reduction. For N
4617 elements, we have N extracts and N-1 reduction ops. */
4618 epilogue_cost += record_stmt_cost (cost_vec,
4619 nelements + nelements - 1,
4620 vector_stmt, stmt_info, 0,
4621 vect_epilogue);
4625 if (dump_enabled_p ())
4626 dump_printf (MSG_NOTE,
4627 "vect_model_reduction_cost: inside_cost = %d, "
4628 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4629 prologue_cost, epilogue_cost);
4632 /* SEQ is a sequence of instructions that initialize the reduction
4633 described by REDUC_INFO. Emit them in the appropriate place. */
4635 static void
4636 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4637 stmt_vec_info reduc_info, gimple *seq)
4639 if (reduc_info->reused_accumulator)
4641 /* When reusing an accumulator from the main loop, we only need
4642 initialization instructions if the main loop can be skipped.
4643 In that case, emit the initialization instructions at the end
4644 of the guard block that does the skip. */
4645 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4646 gcc_assert (skip_edge);
4647 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4648 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4650 else
4652 /* The normal case: emit the initialization instructions on the
4653 preheader edge. */
4654 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4655 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4659 /* Function get_initial_def_for_reduction
4661 Input:
4662 REDUC_INFO - the info_for_reduction
4663 INIT_VAL - the initial value of the reduction variable
4664 NEUTRAL_OP - a value that has no effect on the reduction, as per
4665 neutral_op_for_reduction
4667 Output:
4668 Return a vector variable, initialized according to the operation that
4669 STMT_VINFO performs. This vector will be used as the initial value
4670 of the vector of partial results.
4672 The value we need is a vector in which element 0 has value INIT_VAL
4673 and every other element has value NEUTRAL_OP. */
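
/* Example (values assumed): for a PLUS reduction with INIT_VAL s0 the
   neutral value is 0, so a 4-element initial def is { s0, 0, 0, 0 };
   for MIN/MAX the neutral value is INIT_VAL itself, so the initial def
   degenerates into the splat { s0, s0, s0, s0 } built on the
   operand_equal_p path below.  */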
4675 static tree
4676 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4677 stmt_vec_info reduc_info,
4678 tree init_val, tree neutral_op)
4680 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4681 tree scalar_type = TREE_TYPE (init_val);
4682 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4683 tree init_def;
4684 gimple_seq stmts = NULL;
4686 gcc_assert (vectype);
4688 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4689 || SCALAR_FLOAT_TYPE_P (scalar_type));
4691 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4692 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4694 if (operand_equal_p (init_val, neutral_op))
4696 /* If both elements are equal then the vector described above is
4697 just a splat. */
4698 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4699 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4701 else
4703 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4704 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4705 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4707 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4708 element 0. */
4709 init_def = gimple_build_vector_from_val (&stmts, vectype,
4710 neutral_op);
4711 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4712 vectype, init_def, init_val);
4714 else
4716 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4717 tree_vector_builder elts (vectype, 1, 2);
4718 elts.quick_push (init_val);
4719 elts.quick_push (neutral_op);
4720 init_def = gimple_build_vector (&stmts, &elts);
4724 if (stmts)
4725 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4726 return init_def;
4729 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4730 which performs a reduction involving GROUP_SIZE scalar statements.
4731 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4732 is nonnull, introducing extra elements of that value will not change the
4733 result. */
4735 static void
4736 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4737 stmt_vec_info reduc_info,
4738 vec<tree> *vec_oprnds,
4739 unsigned int number_of_vectors,
4740 unsigned int group_size, tree neutral_op)
4742 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4743 unsigned HOST_WIDE_INT nunits;
4744 unsigned j, number_of_places_left_in_vector;
4745 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4746 unsigned int i;
4748 gcc_assert (group_size == initial_values.length () || neutral_op);
4750 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4751 created vectors. It is greater than 1 if unrolling is performed.
4753 For example, we have two scalar operands, s1 and s2 (e.g., group of
4754 strided accesses of size two), while NUNITS is four (i.e., four scalars
4755 of this type can be packed in a vector). The output vector will contain
4756 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4757 will be 2).
4759 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4760 vectors containing the operands.
4762 For example, NUNITS is four as before, and the group size is 8
4763 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4764 {s5, s6, s7, s8}. */
4766 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4767 nunits = group_size;
4769 number_of_places_left_in_vector = nunits;
4770 bool constant_p = true;
4771 tree_vector_builder elts (vector_type, nunits, 1);
4772 elts.quick_grow (nunits);
4773 gimple_seq ctor_seq = NULL;
4774 for (j = 0; j < nunits * number_of_vectors; ++j)
4776 tree op;
4777 i = j % group_size;
4779      /* Get the def before the loop.  In a reduction chain we have only
4780         one initial value.  Otherwise we have as many as there are PHIs in the group.  */
4781 if (i >= initial_values.length () || (j > i && neutral_op))
4782 op = neutral_op;
4783 else
4784 op = initial_values[i];
4786 /* Create 'vect_ = {op0,op1,...,opn}'. */
4787 number_of_places_left_in_vector--;
4788 elts[nunits - number_of_places_left_in_vector - 1] = op;
4789 if (!CONSTANT_CLASS_P (op))
4790 constant_p = false;
4792 if (number_of_places_left_in_vector == 0)
4794 tree init;
4795 if (constant_p && !neutral_op
4796 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4797 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4798 /* Build the vector directly from ELTS. */
4799 init = gimple_build_vector (&ctor_seq, &elts);
4800 else if (neutral_op)
4802 /* Build a vector of the neutral value and shift the
4803 other elements into place. */
4804 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4805 neutral_op);
4806 int k = nunits;
4807 while (k > 0 && elts[k - 1] == neutral_op)
4808 k -= 1;
4809 while (k > 0)
4811 k -= 1;
4812 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4813 vector_type, init, elts[k]);
4816 else
4818 /* First time round, duplicate ELTS to fill the
4819 required number of vectors. */
4820 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4821 elts, number_of_vectors, *vec_oprnds);
4822 break;
4824 vec_oprnds->quick_push (init);
4826 number_of_places_left_in_vector = nunits;
4827 elts.new_vector (vector_type, nunits, 1);
4828 elts.quick_grow (nunits);
4829 constant_p = true;
4832 if (ctor_seq != NULL)
4833 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4836 /* For a statement STMT_INFO taking part in a reduction operation return
4837    the stmt_vec_info that the meta information is stored on.  */
4839 stmt_vec_info
4840 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4842 stmt_info = vect_orig_stmt (stmt_info);
4843 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4844 if (!is_a <gphi *> (stmt_info->stmt)
4845 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4846 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4847 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4848 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4850 if (gimple_phi_num_args (phi) == 1)
4851 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4853 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4855 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4856 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4857 stmt_info = info;
4859 return stmt_info;
4862 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4863 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4864 return false. */
4866 static bool
4867 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4868 stmt_vec_info reduc_info)
4870 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4871 if (!main_loop_vinfo)
4872 return false;
4874 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4875 return false;
4877 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4878 auto_vec<tree, 16> main_loop_results (num_phis);
4879 auto_vec<tree, 16> initial_values (num_phis);
4880 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4882 /* The epilogue loop can be entered either from the main loop or
4883 from an earlier guard block. */
4884 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4885 for (tree incoming_value : reduc_info->reduc_initial_values)
4887 /* Look for:
4889 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4890 INITIAL_VALUE(guard block)>. */
4891 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4893 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4894 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4896 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4897 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4899 main_loop_results.quick_push (from_main_loop);
4900 initial_values.quick_push (from_skip);
4903 else
4904 /* The main loop dominates the epilogue loop. */
4905 main_loop_results.splice (reduc_info->reduc_initial_values);
4907 /* See if the main loop has the kind of accumulator we need. */
4908 vect_reusable_accumulator *accumulator
4909 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4910 if (!accumulator
4911 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4912 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4913 accumulator->reduc_info->reduc_scalar_results.begin ()))
4914 return false;
4916 /* Handle the case where we can reduce wider vectors to narrower ones. */
4917 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4918 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4919 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4920 TYPE_VECTOR_SUBPARTS (vectype)))
4921 return false;
4923 /* Non-SLP reductions might apply an adjustment after the reduction
4924 operation, in order to simplify the initialization of the accumulator.
4925 If the epilogue loop carries on from where the main loop left off,
4926 it should apply the same adjustment to the final reduction result.
4928 If the epilogue loop can also be entered directly (rather than via
4929 the main loop), we need to be able to handle that case in the same way,
4930 with the same adjustment. (In principle we could add a PHI node
4931 to select the correct adjustment, but in practice that shouldn't be
4932 necessary.) */
4933 tree main_adjustment
4934 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4935 if (loop_vinfo->main_loop_edge && main_adjustment)
4937 gcc_assert (num_phis == 1);
4938 tree initial_value = initial_values[0];
4939 /* Check that we can use INITIAL_VALUE as the adjustment and
4940 initialize the accumulator with a neutral value instead. */
4941 if (!operand_equal_p (initial_value, main_adjustment))
4942 return false;
4943 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4944 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4945 code, initial_value);
4947 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4948 reduc_info->reduc_initial_values.truncate (0);
4949 reduc_info->reduc_initial_values.splice (initial_values);
4950 reduc_info->reused_accumulator = accumulator;
4951 return true;
4954 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4955    CODE, appending the generated stmts to SEQ.  Returns a vector def of VECTYPE.  */
4957 static tree
4958 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4959 gimple_seq *seq)
4961 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4962 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4963 tree stype = TREE_TYPE (vectype);
4964 tree new_temp = vec_def;
4965 while (nunits > nunits1)
4967 nunits /= 2;
4968 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4969 stype, nunits);
4970 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4972 /* The target has to make sure we support lowpart/highpart
4973 extraction, either via direct vector extract or through
4974         integer mode punning.  */
4975 tree dst1, dst2;
4976 gimple *epilog_stmt;
4977 if (convert_optab_handler (vec_extract_optab,
4978 TYPE_MODE (TREE_TYPE (new_temp)),
4979 TYPE_MODE (vectype1))
4980 != CODE_FOR_nothing)
4982 /* Extract sub-vectors directly once vec_extract becomes
4983 a conversion optab. */
4984 dst1 = make_ssa_name (vectype1);
4985 epilog_stmt
4986 = gimple_build_assign (dst1, BIT_FIELD_REF,
4987 build3 (BIT_FIELD_REF, vectype1,
4988 new_temp, TYPE_SIZE (vectype1),
4989 bitsize_int (0)));
4990 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4991 dst2 = make_ssa_name (vectype1);
4992 epilog_stmt
4993 = gimple_build_assign (dst2, BIT_FIELD_REF,
4994 build3 (BIT_FIELD_REF, vectype1,
4995 new_temp, TYPE_SIZE (vectype1),
4996 bitsize_int (bitsize)));
4997 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4999 else
5001 /* Extract via punning to appropriately sized integer mode
5002 vector. */
5003 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5004 tree etype = build_vector_type (eltype, 2);
5005 gcc_assert (convert_optab_handler (vec_extract_optab,
5006 TYPE_MODE (etype),
5007 TYPE_MODE (eltype))
5008 != CODE_FOR_nothing);
5009 tree tem = make_ssa_name (etype);
5010 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5011 build1 (VIEW_CONVERT_EXPR,
5012 etype, new_temp));
5013 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5014 new_temp = tem;
5015 tem = make_ssa_name (eltype);
5016 epilog_stmt
5017 = gimple_build_assign (tem, BIT_FIELD_REF,
5018 build3 (BIT_FIELD_REF, eltype,
5019 new_temp, TYPE_SIZE (eltype),
5020 bitsize_int (0)));
5021 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5022 dst1 = make_ssa_name (vectype1);
5023 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5024 build1 (VIEW_CONVERT_EXPR,
5025 vectype1, tem));
5026 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5027 tem = make_ssa_name (eltype);
5028 epilog_stmt
5029 = gimple_build_assign (tem, BIT_FIELD_REF,
5030 build3 (BIT_FIELD_REF, eltype,
5031 new_temp, TYPE_SIZE (eltype),
5032 bitsize_int (bitsize)));
5033 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5034 dst2 = make_ssa_name (vectype1);
5035 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5036 build1 (VIEW_CONVERT_EXPR,
5037 vectype1, tem));
5038 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5041 new_temp = make_ssa_name (vectype1);
5042 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5043 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5046 return new_temp;
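
/* Worked example (vector sizes assumed): reducing an 8-element vector def
   down to a 4-element VECTYPE with CODE = PLUS_EXPR takes one halving
   step: extract the low and high 4-element halves (directly via
   vec_extract, or by punning through a 2-element integer-mode vector) and
   add them, giving { v0+v4, v1+v5, v2+v6, v3+v7 }.  Reducing 16 -> 4
   repeats the step twice.  */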
5049 /* Function vect_create_epilog_for_reduction
5051 Create code at the loop-epilog to finalize the result of a reduction
5052 computation.
5054 STMT_INFO is the scalar reduction stmt that is being vectorized.
5055 SLP_NODE is an SLP node containing a group of reduction statements. The
5056 first one in this group is STMT_INFO.
5057    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5061 This function:
5062 1. Completes the reduction def-use cycles.
5063 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5064 by calling the function specified by REDUC_FN if available, or by
5065 other means (whole-vector shifts or a scalar loop).
5066 The function also creates a new phi node at the loop exit to preserve
5067 loop-closed form, as illustrated below.
5069 The flow at the entry to this function:
5071 loop:
5072 vec_def = phi <vec_init, null> # REDUCTION_PHI
5073 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5074 s_loop = scalar_stmt # (scalar) STMT_INFO
5075 loop_exit:
5076 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5077 use <s_out0>
5078 use <s_out0>
5080 The above is transformed by this function into:
5082 loop:
5083 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5084 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5085 s_loop = scalar_stmt # (scalar) STMT_INFO
5086 loop_exit:
5087 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5088 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5089 v_out2 = reduce <v_out1>
5090 s_out3 = extract_field <v_out2, 0>
5091 s_out4 = adjust_result <s_out3>
5092 use <s_out4>
5093 use <s_out4>
5096 static void
5097 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5098 stmt_vec_info stmt_info,
5099 slp_tree slp_node,
5100 slp_instance slp_node_instance)
5102 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5103 gcc_assert (reduc_info->is_reduc_info);
5104 /* For double reductions we need to get at the inner loop reduction
5105 stmt which has the meta info attached. Our stmt_info is that of the
5106 loop-closed PHI of the inner loop which we remember as
5107 def for the reduction PHI generation. */
5108 bool double_reduc = false;
5109 stmt_vec_info rdef_info = stmt_info;
5110 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5112 gcc_assert (!slp_node);
5113 double_reduc = true;
5114 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5115 (stmt_info->stmt, 0));
5116 stmt_info = vect_stmt_to_vectorize (stmt_info);
5118 gphi *reduc_def_stmt
5119 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5120 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5121 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5122 tree vectype;
5123 machine_mode mode;
5124 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5125 basic_block exit_bb;
5126 tree scalar_dest;
5127 tree scalar_type;
5128 gimple *new_phi = NULL, *phi;
5129 gimple_stmt_iterator exit_gsi;
5130 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5131 gimple *epilog_stmt = NULL;
5132 gimple *exit_phi;
5133 tree bitsize;
5134 tree def;
5135 tree orig_name, scalar_result;
5136 imm_use_iterator imm_iter, phi_imm_iter;
5137 use_operand_p use_p, phi_use_p;
5138 gimple *use_stmt;
5139 auto_vec<tree> reduc_inputs;
5140 int j, i;
5141 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5142 unsigned int group_size = 1, k;
5143 auto_vec<gimple *> phis;
5144 /* SLP reduction without reduction chain, e.g.,
5145 # a1 = phi <a2, a0>
5146 # b1 = phi <b2, b0>
5147 a2 = operation (a1)
5148 b2 = operation (b1) */
5149 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5150 bool direct_slp_reduc;
5151 tree induction_index = NULL_TREE;
5153 if (slp_node)
5154 group_size = SLP_TREE_LANES (slp_node);
5156 if (nested_in_vect_loop_p (loop, stmt_info))
5158 outer_loop = loop;
5159 loop = loop->inner;
5160 gcc_assert (!slp_node && double_reduc);
5163 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5164 gcc_assert (vectype);
5165 mode = TYPE_MODE (vectype);
5167 tree induc_val = NULL_TREE;
5168 tree adjustment_def = NULL;
5169 if (slp_node)
5171 else
5173 /* Optimize: for induction condition reduction, if we can't use zero
5174 for induc_val, use initial_def. */
5175 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5176 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5177 else if (double_reduc)
5179 else
5180 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5183 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5184 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5185 if (slp_reduc)
5186 /* All statements produce live-out values. */
5187 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5188 else if (slp_node)
5189 /* The last statement in the reduction chain produces the live-out
5190 value. */
5191 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5193 unsigned vec_num;
5194 int ncopies;
5195 if (slp_node)
5197 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5198 ncopies = 1;
5200 else
5202 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5203 vec_num = 1;
5204 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5207 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5208 which is updated with the current index of the loop for every match of
5209 the original loop's cond_expr (VEC_STMT). This results in a vector
5210 containing the last time the condition passed for that vector lane.
5211 The first match will be a 1 to allow 0 to be used for non-matching
5212 indexes. If there are no matches at all then the vector will be all
5213 zeroes.
5215 PR92772: This algorithm is broken for architectures that support
5216 masked vectors, but do not provide fold_extract_last. */
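/* For example, with a 4-lane vector and matches only in lanes 1 and 3
(counting from 0) of the first iteration, the final vector is
{0, 2, 0, 4}: indexes are 1-based and 0 means the lane never matched.
Illustrative only. */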
5217 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5219 auto_vec<std::pair<tree, bool>, 2> ccompares;
5220 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5221 cond_info = vect_stmt_to_vectorize (cond_info);
5222 while (cond_info != reduc_info)
5224 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5226 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5227 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5228 ccompares.safe_push
5229 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5230 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5232 cond_info
5233 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5234 1 + STMT_VINFO_REDUC_IDX
5235 (cond_info)));
5236 cond_info = vect_stmt_to_vectorize (cond_info);
5238 gcc_assert (ccompares.length () != 0);
5240 tree indx_before_incr, indx_after_incr;
5241 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5242 int scalar_precision
5243 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5244 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5245 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5246 (TYPE_MODE (vectype), cr_index_scalar_type,
5247 TYPE_VECTOR_SUBPARTS (vectype));
5249 /* First we create a simple vector induction variable which starts
5250 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5251 vector size (STEP). */
5253 /* Create a {1,2,3,...} vector. */
5254 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5256 /* Create a vector of the step value. */
5257 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5258 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5260 /* Create an induction variable. */
5261 gimple_stmt_iterator incr_gsi;
5262 bool insert_after;
5263 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5264 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5265 insert_after, &indx_before_incr, &indx_after_incr);
5267 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5268 filled with zeros (VEC_ZERO). */
5270 /* Create a vector of 0s. */
5271 tree zero = build_zero_cst (cr_index_scalar_type);
5272 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5274 /* Create a vector phi node. */
5275 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5276 new_phi = create_phi_node (new_phi_tree, loop->header);
5277 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5278 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5280 /* Now take the conditions from the loop's original cond_exprs
5281 and produce new cond_exprs (INDEX_COND_EXPR) which for
5282 every match use values from the induction variable
5283 (INDEX_BEFORE_INCR) and otherwise use values from the phi node
5284 (NEW_PHI_TREE).
5285 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5286 the new cond_expr (INDEX_COND_EXPR). */
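/* An illustrative sketch of what is built below for a single compare:
new_phi_tree = PHI <vec_zero (preheader), index_cond (latch)>
index_cond = VEC_COND_EXPR <ccompare, indx_before_incr, new_phi_tree>
so each lane ends up holding the last index at which its condition
was true, or zero if it never was. */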
5287 gimple_seq stmts = NULL;
5288 for (int i = ccompares.length () - 1; i != -1; --i)
5290 tree ccompare = ccompares[i].first;
5291 if (ccompares[i].second)
5292 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5293 cr_index_vector_type,
5294 ccompare,
5295 indx_before_incr, new_phi_tree);
5296 else
5297 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5298 cr_index_vector_type,
5299 ccompare,
5300 new_phi_tree, indx_before_incr);
5302 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5304 /* Update the phi with the vec cond. */
5305 induction_index = new_phi_tree;
5306 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5307 loop_latch_edge (loop), UNKNOWN_LOCATION);
5310 /* 2. Create epilog code.
5311 The reduction epilog code operates across the elements of the vector
5312 of partial results computed by the vectorized loop.
5313 The reduction epilog code consists of:
5315 step 1: compute the scalar result in a vector (v_out2)
5316 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5317 step 3: adjust the scalar result (s_out3) if needed.
5319 Step 1 can be accomplished using one of the following three schemes:
5320 (scheme 1) using reduc_fn, if available.
5321 (scheme 2) using whole-vector shifts, if available.
5322 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5323 combined.
5325 The overall epilog code looks like this:
5327 s_out0 = phi <s_loop> # original EXIT_PHI
5328 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5329 v_out2 = reduce <v_out1> # step 1
5330 s_out3 = extract_field <v_out2, 0> # step 2
5331 s_out4 = adjust_result <s_out3> # step 3
5333 (step 3 is optional, and steps 1 and 2 may be combined).
5334 Lastly, the uses of s_out0 are replaced by s_out4. */
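/* For instance, a PLUS reduction over a 4-lane v_out1 = {a, b, c, d}
yields s_out3 = a + b + c + d, either directly via reduc_fn (scheme 1)
or via two shift-and-add steps followed by an element extract
(scheme 2). Illustrative only. */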
5337 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5338 v_out1 = phi <VECT_DEF>
5339 Store them in NEW_PHIS. */
5340 if (double_reduc)
5341 loop = outer_loop;
5342 exit_bb = single_exit (loop)->dest;
5343 exit_gsi = gsi_after_labels (exit_bb);
5344 reduc_inputs.create (slp_node ? vec_num : ncopies);
5345 for (unsigned i = 0; i < vec_num; i++)
5347 gimple_seq stmts = NULL;
5348 if (slp_node)
5349 def = vect_get_slp_vect_def (slp_node, i);
5350 else
5351 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5352 for (j = 0; j < ncopies; j++)
5354 tree new_def = copy_ssa_name (def);
5355 phi = create_phi_node (new_def, exit_bb);
5356 if (j)
5357 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5358 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5359 new_def = gimple_convert (&stmts, vectype, new_def);
5360 reduc_inputs.quick_push (new_def);
5362 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5365 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5366 (i.e. when reduc_fn is not available) and in the final adjustment
5367 code (if needed). Also get the original scalar reduction variable as
5368 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5369 represents a reduction pattern), the tree-code and scalar-def are
5370 taken from the original stmt that the pattern-stmt (STMT) replaces.
5371 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5372 are taken from STMT. */
5374 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5375 if (orig_stmt_info != stmt_info)
5377 /* Reduction pattern */
5378 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5379 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5382 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5383 scalar_type = TREE_TYPE (scalar_dest);
5384 scalar_results.create (group_size);
5385 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5386 bitsize = TYPE_SIZE (scalar_type);
5388 /* True if we should implement SLP_REDUC using native reduction operations
5389 instead of scalar operations. */
5390 direct_slp_reduc = (reduc_fn != IFN_LAST
5391 && slp_reduc
5392 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5394 /* In case of reduction chain, e.g.,
5395 # a1 = phi <a3, a0>
5396 a2 = operation (a1)
5397 a3 = operation (a2),
5399 we may end up with more than one vector result. Here we reduce them
5400 to one vector.
5402 The same is true if we couldn't use a single def-use cycle. */
5403 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5404 || direct_slp_reduc
5405 || ncopies > 1)
5407 gimple_seq stmts = NULL;
5408 tree single_input = reduc_inputs[0];
5409 for (k = 1; k < reduc_inputs.length (); k++)
5410 single_input = gimple_build (&stmts, code, vectype,
5411 single_input, reduc_inputs[k]);
5412 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5414 reduc_inputs.truncate (0);
5415 reduc_inputs.safe_push (single_input);
5418 tree orig_reduc_input = reduc_inputs[0];
5420 /* If this loop is an epilogue loop that can be skipped after the
5421 main loop, we can only share a reduction operation between the
5422 main loop and the epilogue if we put it at the target of the
5423 skip edge.
5425 We can still reuse accumulators if this check fails. Doing so has
5426 the minor(?) benefit of making the epilogue loop's scalar result
5427 independent of the main loop's scalar result. */
5428 bool unify_with_main_loop_p = false;
5429 if (reduc_info->reused_accumulator
5430 && loop_vinfo->skip_this_loop_edge
5431 && single_succ_p (exit_bb)
5432 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5434 unify_with_main_loop_p = true;
5436 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5437 reduc_inputs[0] = make_ssa_name (vectype);
5438 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5439 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5440 UNKNOWN_LOCATION);
5441 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5442 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5443 exit_gsi = gsi_after_labels (reduc_block);
5446 /* Shouldn't be used beyond this point. */
5447 exit_bb = nullptr;
5449 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5450 && reduc_fn != IFN_LAST)
5452 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5453 various data values where the condition matched and another vector
5454 (INDUCTION_INDEX) containing all the indexes of those matches. We
5455 need to extract the last matching index (which will be the index with
5456 highest value) and use this to index into the data vector.
5457 For the case where there were no matches, the data vector will contain
5458 all default values and the index vector will be all zeros. */
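/* For example, with REDUC_INPUTS[0] = {d0, d1, d2, d3} and
INDUCTION_INDEX = {0, 2, 0, 4} the maximum index is 4, so the value
extracted below is d3. Illustrative only. */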
5460 /* Get various versions of the type of the vector of indexes. */
5461 tree index_vec_type = TREE_TYPE (induction_index);
5462 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5463 tree index_scalar_type = TREE_TYPE (index_vec_type);
5464 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5466 /* Get an unsigned integer version of the type of the data vector. */
5467 int scalar_precision
5468 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5469 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5470 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5471 vectype);
5473 /* First we need to create a vector (ZERO_VEC) of zeros and another
5474 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5475 can create using a MAX reduction and then expanding.
5476 In the case where the loop never made any matches, the max index will
5477 be zero. */
5479 /* Vector of {0, 0, 0,...}. */
5480 tree zero_vec = build_zero_cst (vectype);
5482 /* Find maximum value from the vector of found indexes. */
5483 tree max_index = make_ssa_name (index_scalar_type);
5484 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5485 1, induction_index);
5486 gimple_call_set_lhs (max_index_stmt, max_index);
5487 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5489 /* Vector of {max_index, max_index, max_index,...}. */
5490 tree max_index_vec = make_ssa_name (index_vec_type);
5491 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5492 max_index);
5493 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5494 max_index_vec_rhs);
5495 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5497 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5498 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5499 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5500 otherwise. Only one value should match, resulting in a vector
5501 (VEC_COND) with one data value and the rest zeros.
5502 In the case where the loop never made any matches, every index will
5503 match, resulting in a vector with all data values (which will all be
5504 the default value). */
5506 /* Compare the max index vector to the vector of found indexes to find
5507 the position of the max value. */
5508 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5509 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5510 induction_index,
5511 max_index_vec);
5512 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5514 /* Use the compare to choose either values from the data vector or
5515 zero. */
5516 tree vec_cond = make_ssa_name (vectype);
5517 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5518 vec_compare,
5519 reduc_inputs[0],
5520 zero_vec);
5521 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5523 /* Finally we need to extract the data value from the vector (VEC_COND)
5524 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5525 reduction, but because this doesn't exist, we can use a MAX reduction
5526 instead. The data value might be signed or a float so we need to cast
5527 it first.
5528 In the case where the loop never made any matches, the data values are
5529 all identical, and so will reduce down correctly. */
5531 /* Make the matched data values unsigned. */
5532 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5533 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5534 vec_cond);
5535 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5536 VIEW_CONVERT_EXPR,
5537 vec_cond_cast_rhs);
5538 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5540 /* Reduce down to a scalar value. */
5541 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5542 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5543 1, vec_cond_cast);
5544 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5545 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5547 /* Convert the reduced value back to the result type and set as the
5548 result. */
5549 gimple_seq stmts = NULL;
5550 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5551 data_reduc);
5552 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5553 scalar_results.safe_push (new_temp);
5555 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5556 && reduc_fn == IFN_LAST)
5558 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5559 the equivalent of
5560 idx_val = induction_index[0];
5561 val = data_reduc[0];
5562 for (i = 1; i < nelts; ++i)
5563 if (induction_index[i] > idx_val)
5564 val = data_reduc[i], idx_val = induction_index[i];
5565 return val; */
5567 tree data_eltype = TREE_TYPE (vectype);
5568 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5569 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5570 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5571 /* Enforced by vectorizable_reduction, which ensures we have target
5572 support before allowing a conditional reduction on variable-length
5573 vectors. */
5574 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5575 tree idx_val = NULL_TREE, val = NULL_TREE;
5576 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5578 tree old_idx_val = idx_val;
5579 tree old_val = val;
5580 idx_val = make_ssa_name (idx_eltype);
5581 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5582 build3 (BIT_FIELD_REF, idx_eltype,
5583 induction_index,
5584 bitsize_int (el_size),
5585 bitsize_int (off)));
5586 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5587 val = make_ssa_name (data_eltype);
5588 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5589 build3 (BIT_FIELD_REF,
5590 data_eltype,
5591 reduc_inputs[0],
5592 bitsize_int (el_size),
5593 bitsize_int (off)));
5594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5595 if (off != 0)
5597 tree new_idx_val = idx_val;
5598 if (off != v_size - el_size)
5600 new_idx_val = make_ssa_name (idx_eltype);
5601 epilog_stmt = gimple_build_assign (new_idx_val,
5602 MAX_EXPR, idx_val,
5603 old_idx_val);
5604 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5606 tree new_val = make_ssa_name (data_eltype);
5607 epilog_stmt = gimple_build_assign (new_val,
5608 COND_EXPR,
5609 build2 (GT_EXPR,
5610 boolean_type_node,
5611 idx_val,
5612 old_idx_val),
5613 val, old_val);
5614 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5615 idx_val = new_idx_val;
5616 val = new_val;
5619 /* Convert the reduced value back to the result type and set as the
5620 result. */
5621 gimple_seq stmts = NULL;
5622 val = gimple_convert (&stmts, scalar_type, val);
5623 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5624 scalar_results.safe_push (val);
5627 /* 2.3 Create the reduction code, using one of the three schemes described
5628 above. In SLP we simply need to extract all the elements from the
5629 vector (without reducing them), so we use scalar shifts. */
5630 else if (reduc_fn != IFN_LAST && !slp_reduc)
5632 tree tmp;
5633 tree vec_elem_type;
5635 /* Case 1: Create:
5636 v_out2 = reduc_expr <v_out1> */
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_NOTE, vect_location,
5640 "Reduce using direct vector reduction.\n");
5642 gimple_seq stmts = NULL;
5643 vec_elem_type = TREE_TYPE (vectype);
5644 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5645 vec_elem_type, reduc_inputs[0]);
5646 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5647 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5649 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5650 && induc_val)
5652 /* Earlier we set the initial value to be a vector of induc_val
5653 values. Check the result and if it is induc_val then replace
5654 with the original initial value, unless induc_val is
5655 the same as initial_def already. */
5656 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5657 induc_val);
5658 tree initial_def = reduc_info->reduc_initial_values[0];
5660 tmp = make_ssa_name (new_scalar_dest);
5661 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5662 initial_def, new_temp);
5663 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5664 new_temp = tmp;
5667 scalar_results.safe_push (new_temp);
5669 else if (direct_slp_reduc)
5671 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5672 with the elements for other SLP statements replaced with the
5673 neutral value. We can then do a normal reduction on each vector. */
5675 /* Enforced by vectorizable_reduction. */
5676 gcc_assert (reduc_inputs.length () == 1);
5677 gcc_assert (pow2p_hwi (group_size));
5679 gimple_seq seq = NULL;
5681 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5682 and the same element size as VECTYPE. */
5683 tree index = build_index_vector (vectype, 0, 1);
5684 tree index_type = TREE_TYPE (index);
5685 tree index_elt_type = TREE_TYPE (index_type);
5686 tree mask_type = truth_type_for (index_type);
5688 /* Create a vector that, for each element, identifies which of
5689 the REDUC_GROUP_SIZE results should use it. */
5690 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5691 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5692 build_vector_from_val (index_type, index_mask));
5694 /* Get a neutral vector value. This is simply a splat of the neutral
5695 scalar value if we have one, otherwise the initial scalar value
5696 is itself a neutral value. */
5697 tree vector_identity = NULL_TREE;
5698 tree neutral_op = NULL_TREE;
5699 if (slp_node)
5701 tree initial_value = NULL_TREE;
5702 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5703 initial_value = reduc_info->reduc_initial_values[0];
5704 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5705 initial_value);
5707 if (neutral_op)
5708 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5709 neutral_op);
5710 for (unsigned int i = 0; i < group_size; ++i)
5712 /* If there's no universal neutral value, we can use the
5713 initial scalar value from the original PHI. This is used
5714 for MIN and MAX reduction, for example. */
5715 if (!neutral_op)
5717 tree scalar_value = reduc_info->reduc_initial_values[i];
5718 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5719 scalar_value);
5720 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5721 scalar_value);
5724 /* Calculate the equivalent of:
5726 sel[j] = (index[j] == i);
5728 which selects the elements of REDUC_INPUTS[0] that should
5729 be included in the result. */
5730 tree compare_val = build_int_cst (index_elt_type, i);
5731 compare_val = build_vector_from_val (index_type, compare_val);
5732 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5733 index, compare_val);
5735 /* Calculate the equivalent of:
5737 vec = sel ? reduc_inputs[0] : vector_identity;
5739 VEC is now suitable for a full vector reduction. */
5740 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5741 sel, reduc_inputs[0], vector_identity);
5743 /* Do the reduction and convert it to the appropriate type. */
5744 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5745 TREE_TYPE (vectype), vec);
5746 scalar = gimple_convert (&seq, scalar_type, scalar);
5747 scalar_results.safe_push (scalar);
5749 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5751 else
5753 bool reduce_with_shift;
5754 tree vec_temp;
5756 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5758 /* See if the target wants to do the final (shift) reduction
5759 in a vector mode of smaller size and first reduce upper/lower
5760 halves against each other. */
5761 enum machine_mode mode1 = mode;
5762 tree stype = TREE_TYPE (vectype);
5763 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5764 unsigned nunits1 = nunits;
5765 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5766 && reduc_inputs.length () == 1)
5768 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5769 /* For SLP reductions we have to make sure lanes match up, but
5770 since we're doing individual element final reduction reducing
5771 vector width here is even more important.
5772 ??? We can also separate lanes with permutes; for the common
5773 case of a power-of-two group size, odd/even extracts would work. */
5774 if (slp_reduc && nunits != nunits1)
5776 nunits1 = least_common_multiple (nunits1, group_size);
5777 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5780 if (!slp_reduc
5781 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5782 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5784 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5785 stype, nunits1);
5786 reduce_with_shift = have_whole_vector_shift (mode1);
5787 if (!VECTOR_MODE_P (mode1))
5788 reduce_with_shift = false;
5789 else
5791 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5792 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5793 reduce_with_shift = false;
5796 /* First reduce the vector to the desired vector size we should
5797 do shift reduction on by combining upper and lower halves. */
5798 gimple_seq stmts = NULL;
5799 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5800 code, &stmts);
5801 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5802 reduc_inputs[0] = new_temp;
5804 if (reduce_with_shift && !slp_reduc)
5806 int element_bitsize = tree_to_uhwi (bitsize);
5807 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5808 for variable-length vectors and also requires direct target support
5809 for loop reductions. */
5810 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5811 int nelements = vec_size_in_bits / element_bitsize;
5812 vec_perm_builder sel;
5813 vec_perm_indices indices;
5815 int elt_offset;
5817 tree zero_vec = build_zero_cst (vectype1);
5818 /* Case 2: Create:
5819 for (offset = nelements/2; offset >= 1; offset/=2)
5821 Create: va' = vec_shift <va, offset>
5822 Create: va = vop <va, va'>
5823 } */
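/* For example, reducing {a, b, c, d} with PLUS: shifting by 2 and
adding leaves {a+c, b+d, ...}; shifting by 1 and adding then leaves
a+b+c+d in element 0, which is extracted below. Illustrative only. */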
5825 tree rhs;
5827 if (dump_enabled_p ())
5828 dump_printf_loc (MSG_NOTE, vect_location,
5829 "Reduce using vector shifts\n");
5831 gimple_seq stmts = NULL;
5832 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5833 for (elt_offset = nelements / 2;
5834 elt_offset >= 1;
5835 elt_offset /= 2)
5837 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5838 indices.new_vector (sel, 2, nelements);
5839 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5840 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5841 new_temp, zero_vec, mask);
5842 new_temp = gimple_build (&stmts, code,
5843 vectype1, new_name, new_temp);
5845 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5847 /* 2.4 Extract the final scalar result. Create:
5848 s_out3 = extract_field <v_out2, bitpos> */
5850 if (dump_enabled_p ())
5851 dump_printf_loc (MSG_NOTE, vect_location,
5852 "extract scalar result\n");
5854 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5855 bitsize, bitsize_zero_node);
5856 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5857 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5858 gimple_assign_set_lhs (epilog_stmt, new_temp);
5859 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5860 scalar_results.safe_push (new_temp);
5862 else
5864 /* Case 3: Create:
5865 s = extract_field <v_out2, 0>
5866 for (offset = element_size;
5867 offset < vector_size;
5868 offset += element_size)
5870 Create: s' = extract_field <v_out2, offset>
5871 Create: s = op <s, s'> // For non SLP cases
5872 } */
5874 if (dump_enabled_p ())
5875 dump_printf_loc (MSG_NOTE, vect_location,
5876 "Reduce using scalar code.\n");
5878 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5879 int element_bitsize = tree_to_uhwi (bitsize);
5880 tree compute_type = TREE_TYPE (vectype);
5881 gimple_seq stmts = NULL;
5882 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5884 int bit_offset;
5885 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5886 vec_temp, bitsize, bitsize_zero_node);
5888 /* In SLP we don't need to apply reduction operation, so we just
5889 collect s' values in SCALAR_RESULTS. */
5890 if (slp_reduc)
5891 scalar_results.safe_push (new_temp);
5893 for (bit_offset = element_bitsize;
5894 bit_offset < vec_size_in_bits;
5895 bit_offset += element_bitsize)
5897 tree bitpos = bitsize_int (bit_offset);
5898 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5899 compute_type, vec_temp,
5900 bitsize, bitpos);
5901 if (slp_reduc)
5903 /* In SLP we don't need to apply reduction operation, so
5904 we just collect s' values in SCALAR_RESULTS. */
5905 new_temp = new_name;
5906 scalar_results.safe_push (new_name);
5908 else
5909 new_temp = gimple_build (&stmts, code, compute_type,
5910 new_name, new_temp);
5914 /* The only case where we need to reduce scalar results in SLP is
5915 unrolling. If the size of SCALAR_RESULTS is greater than
5916 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5917 REDUC_GROUP_SIZE. */
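/* E.g. with REDUC_GROUP_SIZE 2 and four scalar results {r0, r1, r2, r3}
the loop below leaves {r0 op r2, r1 op r3}. Illustrative only. */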
5918 if (slp_reduc)
5920 tree res, first_res, new_res;
5922 /* Reduce multiple scalar results in case of SLP unrolling. */
5923 for (j = group_size; scalar_results.iterate (j, &res);
5924 j++)
5926 first_res = scalar_results[j % group_size];
5927 new_res = gimple_build (&stmts, code, compute_type,
5928 first_res, res);
5929 scalar_results[j % group_size] = new_res;
5931 scalar_results.truncate (group_size);
5932 for (k = 0; k < group_size; k++)
5933 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5934 scalar_results[k]);
5936 else
5938 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5939 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5940 scalar_results.safe_push (new_temp);
5943 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5946 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5947 && induc_val)
5949 /* Earlier we set the initial value to be a vector of induc_val
5950 values. Check the result and if it is induc_val then replace
5951 with the original initial value, unless induc_val is
5952 the same as initial_def already. */
5953 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5954 induc_val);
5955 tree initial_def = reduc_info->reduc_initial_values[0];
5957 tree tmp = make_ssa_name (new_scalar_dest);
5958 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5959 initial_def, new_temp);
5960 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5961 scalar_results[0] = tmp;
5965 /* 2.5 Adjust the final result by the initial value of the reduction
5966 variable. (When such adjustment is not needed, then
5967 'adjustment_def' is zero). For example, if code is PLUS we create:
5968 new_temp = loop_exit_def + adjustment_def */
5970 if (adjustment_def)
5972 gcc_assert (!slp_reduc);
5973 gimple_seq stmts = NULL;
5974 if (double_reduc)
5976 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5977 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5978 new_temp = gimple_build (&stmts, code, vectype,
5979 reduc_inputs[0], adjustment_def);
5981 else
5983 new_temp = scalar_results[0];
5984 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5985 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5986 new_temp = gimple_build (&stmts, code, scalar_type,
5987 new_temp, adjustment_def);
5990 epilog_stmt = gimple_seq_last_stmt (stmts);
5991 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5992 scalar_results[0] = new_temp;
5995 /* Record this operation if it could be reused by the epilogue loop. */
5996 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5997 loop_vinfo->reusable_accumulators.put (scalar_results[0],
5998 { orig_reduc_input, reduc_info });
6000 if (double_reduc)
6001 loop = outer_loop;
6003 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6004 phis with new adjusted scalar results, i.e., replace use <s_out0>
6005 with use <s_out4>.
6007 Transform:
6008 loop_exit:
6009 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6010 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6011 v_out2 = reduce <v_out1>
6012 s_out3 = extract_field <v_out2, 0>
6013 s_out4 = adjust_result <s_out3>
6014 use <s_out0>
6015 use <s_out0>
6017 into:
6019 loop_exit:
6020 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6021 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6022 v_out2 = reduce <v_out1>
6023 s_out3 = extract_field <v_out2, 0>
6024 s_out4 = adjust_result <s_out3>
6025 use <s_out4>
6026 use <s_out4> */
6028 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6029 for (k = 0; k < live_out_stmts.size (); k++)
6031 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6032 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6034 phis.create (3);
6035 /* Find the loop-closed-use at the loop exit of the original scalar
6036 result. (The reduction result is expected to have two immediate uses,
6037 one at the latch block, and one at the loop exit). For double
6038 reductions we are looking for exit phis of the outer loop. */
6039 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6041 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6043 if (!is_gimple_debug (USE_STMT (use_p)))
6044 phis.safe_push (USE_STMT (use_p));
6046 else
6048 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6050 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6052 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6054 if (!flow_bb_inside_loop_p (loop,
6055 gimple_bb (USE_STMT (phi_use_p)))
6056 && !is_gimple_debug (USE_STMT (phi_use_p)))
6057 phis.safe_push (USE_STMT (phi_use_p));
6063 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6065 /* Replace the uses: */
6066 orig_name = PHI_RESULT (exit_phi);
6068 /* Look for a single use at the target of the skip edge. */
6069 if (unify_with_main_loop_p)
6071 use_operand_p use_p;
6072 gimple *user;
6073 if (!single_imm_use (orig_name, &use_p, &user))
6074 gcc_unreachable ();
6075 orig_name = gimple_get_lhs (user);
6078 scalar_result = scalar_results[k];
6079 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6081 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6082 SET_USE (use_p, scalar_result);
6083 update_stmt (use_stmt);
6087 phis.release ();
6091 /* Return a vector of type VECTYPE that is equal to the vector select
6092 operation "MASK ? VEC : IDENTITY". Insert the select statements
6093 before GSI. */
6095 static tree
6096 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6097 tree vec, tree identity)
6099 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6100 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6101 mask, vec, identity);
6102 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6103 return cond;
6106 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6107 order, starting with LHS. Insert the extraction statements before GSI and
6108 associate the new scalar SSA names with variable SCALAR_DEST.
6109 Return the SSA name for the result. */
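/* For a 4-element VECTOR_RHS this emits, roughly:
s0 = BIT_FIELD_REF <vector_rhs, 0>; lhs1 = lhs CODE s0;
s1 = BIT_FIELD_REF <vector_rhs, 1>; lhs2 = lhs1 CODE s1;
...
and returns the final accumulator. An illustrative sketch only. */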
6111 static tree
6112 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6113 tree_code code, tree lhs, tree vector_rhs)
6115 tree vectype = TREE_TYPE (vector_rhs);
6116 tree scalar_type = TREE_TYPE (vectype);
6117 tree bitsize = TYPE_SIZE (scalar_type);
6118 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6119 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6121 for (unsigned HOST_WIDE_INT bit_offset = 0;
6122 bit_offset < vec_size_in_bits;
6123 bit_offset += element_bitsize)
6125 tree bitpos = bitsize_int (bit_offset);
6126 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6127 bitsize, bitpos);
6129 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6130 rhs = make_ssa_name (scalar_dest, stmt);
6131 gimple_assign_set_lhs (stmt, rhs);
6132 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6134 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6135 tree new_name = make_ssa_name (scalar_dest, stmt);
6136 gimple_assign_set_lhs (stmt, new_name);
6137 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6138 lhs = new_name;
6140 return lhs;
6143 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6144 type of the vector input. */
6146 static internal_fn
6147 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6149 internal_fn mask_reduc_fn;
6151 switch (reduc_fn)
6153 case IFN_FOLD_LEFT_PLUS:
6154 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6155 break;
6157 default:
6158 return IFN_LAST;
6161 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6162 OPTIMIZE_FOR_SPEED))
6163 return mask_reduc_fn;
6164 return IFN_LAST;
6167 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6168 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6169 statement. CODE is the operation performed by STMT_INFO and OPS are
6170 its scalar operands. REDUC_INDEX is the index of the operand in
6171 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6172 implements in-order reduction, or IFN_LAST if we should open-code it.
6173 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6174 that should be used to control the operation in a fully-masked loop. */
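/* In a fully-masked loop without a masked reduction function, the
inactive lanes of each vector operand are first replaced with a zero
identity (see merge_with_identity) so that folding them leaves the
accumulator unchanged for the PLUS (and negated MINUS) cases handled
here. A sketch of the intent, not additional semantics. */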
6176 static bool
6177 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6178 stmt_vec_info stmt_info,
6179 gimple_stmt_iterator *gsi,
6180 gimple **vec_stmt, slp_tree slp_node,
6181 gimple *reduc_def_stmt,
6182 tree_code code, internal_fn reduc_fn,
6183 tree ops[3], tree vectype_in,
6184 int reduc_index, vec_loop_masks *masks)
6186 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6187 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6188 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6190 int ncopies;
6191 if (slp_node)
6192 ncopies = 1;
6193 else
6194 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6196 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6197 gcc_assert (ncopies == 1);
6198 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6200 if (slp_node)
6201 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6202 TYPE_VECTOR_SUBPARTS (vectype_in)));
6204 tree op0 = ops[1 - reduc_index];
6206 int group_size = 1;
6207 stmt_vec_info scalar_dest_def_info;
6208 auto_vec<tree> vec_oprnds0;
6209 if (slp_node)
6211 auto_vec<vec<tree> > vec_defs (2);
6212 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6213 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6214 vec_defs[0].release ();
6215 vec_defs[1].release ();
6216 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6217 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6219 else
6221 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6222 op0, &vec_oprnds0);
6223 scalar_dest_def_info = stmt_info;
6226 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6227 tree scalar_type = TREE_TYPE (scalar_dest);
6228 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6230 int vec_num = vec_oprnds0.length ();
6231 gcc_assert (vec_num == 1 || slp_node);
6232 tree vec_elem_type = TREE_TYPE (vectype_out);
6233 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6235 tree vector_identity = NULL_TREE;
6236 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6237 vector_identity = build_zero_cst (vectype_out);
6239 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6240 int i;
6241 tree def0;
6242 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6244 gimple *new_stmt;
6245 tree mask = NULL_TREE;
6246 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6247 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6249 /* Handle MINUS by adding the negative. */
6250 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6252 tree negated = make_ssa_name (vectype_out);
6253 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6254 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6255 def0 = negated;
6258 if (mask && mask_reduc_fn == IFN_LAST)
6259 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6260 vector_identity);
6262 /* On the first iteration the input is simply the scalar phi
6263 result, and for subsequent iterations it is the output of
6264 the preceding operation. */
6265 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6267 if (mask && mask_reduc_fn != IFN_LAST)
6268 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6269 def0, mask);
6270 else
6271 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6272 def0);
6273 /* For chained SLP reductions the output of the previous reduction
6274 operation serves as the input of the next. For the final statement
6275 the output cannot be a temporary - we reuse the original
6276 scalar destination of the last statement. */
6277 if (i != vec_num - 1)
6279 gimple_set_lhs (new_stmt, scalar_dest_var);
6280 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6281 gimple_set_lhs (new_stmt, reduc_var);
6284 else
6286 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6287 reduc_var, def0);
6288 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6289 /* Remove the statement, so that we can use the same code paths
6290 as for statements that we've just created. */
6291 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6292 gsi_remove (&tmp_gsi, true);
6295 if (i == vec_num - 1)
6297 gimple_set_lhs (new_stmt, scalar_dest);
6298 vect_finish_replace_stmt (loop_vinfo,
6299 scalar_dest_def_info,
6300 new_stmt);
6302 else
6303 vect_finish_stmt_generation (loop_vinfo,
6304 scalar_dest_def_info,
6305 new_stmt, gsi);
6307 if (slp_node)
6308 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6309 else
6311 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6312 *vec_stmt = new_stmt;
6316 return true;
6319 /* Function is_nonwrapping_integer_induction.
6321 Check if the integer induction defined by STMT_VINFO (which is part of
6322 loop LOOP) increments without its value ever overflowing. */
6324 static bool
6325 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6327 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6328 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6329 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6330 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6331 widest_int ni, max_loop_value, lhs_max;
6332 wi::overflow_type overflow = wi::OVF_NONE;
6334 /* Make sure the loop is integer based. */
6335 if (TREE_CODE (base) != INTEGER_CST
6336 || TREE_CODE (step) != INTEGER_CST)
6337 return false;
6339 /* Check that the max size of the loop will not wrap. */
6341 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6342 return true;
6344 if (! max_stmt_executions (loop, &ni))
6345 return false;
6347 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6348 &overflow);
6349 if (overflow)
6350 return false;
6352 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6353 TYPE_SIGN (lhs_type), &overflow);
6354 if (overflow)
6355 return false;
6357 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6358 <= TYPE_PRECISION (lhs_type));
6361 /* Check if masking can be supported by inserting a conditional expression.
6362 CODE is the code for the operation. COND_FN is the conditional internal
6363 function, if it exists. VECTYPE_IN is the type of the vector input. */
6364 static bool
6365 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6366 tree vectype_in)
6368 if (cond_fn != IFN_LAST
6369 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6370 OPTIMIZE_FOR_SPEED))
6371 return false;
6373 switch (code)
6375 case DOT_PROD_EXPR:
6376 case SAD_EXPR:
6377 return true;
6379 default:
6380 return false;
6384 /* Insert a conditional expression to enable masked vectorization. CODE is the
6385 code for the operation. VOP is the array of operands. MASK is the loop
6386 mask. GSI is a statement iterator used to place the new conditional
6387 expression. */
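/* For DOT_PROD_EXPR the inactive lanes of operand 1 are replaced with
zero so their products contribute nothing; for SAD_EXPR they are
replaced with operand 0 so the absolute differences are zero. This
only describes the selects built below. */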
6388 static void
6389 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6390 gimple_stmt_iterator *gsi)
6392 switch (code)
6394 case DOT_PROD_EXPR:
6396 tree vectype = TREE_TYPE (vop[1]);
6397 tree zero = build_zero_cst (vectype);
6398 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6399 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6400 mask, vop[1], zero);
6401 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6402 vop[1] = masked_op1;
6403 break;
6406 case SAD_EXPR:
6408 tree vectype = TREE_TYPE (vop[1]);
6409 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6410 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6411 mask, vop[1], vop[0]);
6412 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6413 vop[1] = masked_op1;
6414 break;
6417 default:
6418 gcc_unreachable ();
6422 /* Function vectorizable_reduction.
6424 Check if STMT_INFO performs a reduction operation that can be vectorized.
6425 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6426 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6427 Return true if STMT_INFO is vectorizable in this way.
6429 This function also handles reduction idioms (patterns) that have been
6430 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6431 may be of this form:
6432 X = pattern_expr (arg0, arg1, ..., X)
6433 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6434 sequence that had been detected and replaced by the pattern-stmt
6435 (STMT_INFO).
6437 This function also handles reduction of condition expressions, for example:
6438 for (int i = 0; i < N; i++)
6439 if (a[i] < value)
6440 last = a[i];
6441 This is handled by vectorising the loop and creating an additional vector
6442 containing the loop indexes for which "a[i] < value" was true. In the
6443 function epilogue this is reduced to a single max value and then used to
6444 index into the vector of results.
6446 In some cases of reduction patterns, the type of the reduction variable X is
6447 different than the type of the other arguments of STMT_INFO.
6448 In such cases, the vectype that is used when transforming STMT_INFO into
6449 a vector stmt is different than the vectype that is used to determine the
6450 vectorization factor, because it consists of a different number of elements
6451 than the actual number of elements that are being operated upon in parallel.
6453 For example, consider an accumulation of shorts into an int accumulator.
6454 On some targets it's possible to vectorize this pattern operating on 8
6455 shorts at a time (hence, the vectype for purposes of determining the
6456 vectorization factor should be V8HI); on the other hand, the vectype that
6457 is used to create the vector form is actually V4SI (the type of the result).
6459 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6460 indicates what is the actual level of parallelism (V8HI in the example), so
6461 that the right vectorization factor would be derived. This vectype
6462 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6463 be used to create the vectorized stmt. The right vectype for the vectorized
6464 stmt is obtained from the type of the result X:
6465 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6467 This means that, contrary to "regular" reductions (or "regular" stmts in
6468 general), the following equation:
6469 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6470 does *NOT* necessarily hold for reduction patterns. */
6472 bool
6473 vectorizable_reduction (loop_vec_info loop_vinfo,
6474 stmt_vec_info stmt_info, slp_tree slp_node,
6475 slp_instance slp_node_instance,
6476 stmt_vector_for_cost *cost_vec)
6478 tree scalar_dest;
6479 tree vectype_in = NULL_TREE;
6480 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6481 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6482 stmt_vec_info cond_stmt_vinfo = NULL;
6483 tree scalar_type;
6484 int i;
6485 int ncopies;
6486 bool single_defuse_cycle = false;
6487 bool nested_cycle = false;
6488 bool double_reduc = false;
6489 int vec_num;
6490 tree tem;
6491 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6492 tree cond_reduc_val = NULL_TREE;
6494 /* Make sure it was already recognized as a reduction computation. */
6495 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6496 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6497 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6498 return false;
6500 /* The stmt we store reduction analysis meta on. */
6501 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6502 reduc_info->is_reduc_info = true;
6504 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6506 if (is_a <gphi *> (stmt_info->stmt))
6508 if (slp_node)
6510 /* We eventually need to set a vector type on invariant
6511 arguments. */
6512 unsigned j;
6513 slp_tree child;
6514 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6515 if (!vect_maybe_update_slp_op_vectype
6516 (child, SLP_TREE_VECTYPE (slp_node)))
6518 if (dump_enabled_p ())
6519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6520 "incompatible vector types for "
6521 "invariants\n");
6522 return false;
6525 /* Analysis for double-reduction is done on the outer
6526 loop PHI, nested cycles have no further restrictions. */
6527 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6529 else
6530 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6531 return true;
6534 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6535 stmt_vec_info phi_info = stmt_info;
6536 if (!is_a <gphi *> (stmt_info->stmt))
6538 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6539 return true;
6541 if (slp_node)
6543 slp_node_instance->reduc_phis = slp_node;
6544 /* ??? We're leaving slp_node to point to the PHIs; we only
6545 need it to get at the number of vector stmts, which wasn't
6546 yet initialized for the instance root. */
6548 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6549 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6550 else
6552 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6553 == vect_double_reduction_def);
6554 use_operand_p use_p;
6555 gimple *use_stmt;
6556 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6557 &use_p, &use_stmt);
6558 gcc_assert (res);
6559 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6560 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6563 /* PHIs should not participate in patterns. */
6564 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6565 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6567 /* Verify that following REDUC_IDX from the latch def leads us back to
6568 the PHI and compute the reduction chain length. Discover the real
6569 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6570 tree reduc_def
6571 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6572 loop_latch_edge
6573 (gimple_bb (reduc_def_phi)->loop_father));
6574 unsigned reduc_chain_length = 0;
6575 bool only_slp_reduc_chain = true;
6576 stmt_info = NULL;
6577 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6578 while (reduc_def != PHI_RESULT (reduc_def_phi))
6580 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6581 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6582 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6584 if (dump_enabled_p ())
6585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6586 "reduction chain broken by patterns.\n");
6587 return false;
6589 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6590 only_slp_reduc_chain = false;
6591 /* ??? For epilogue generation live members of the chain need
6592 to point back to the PHI via their original stmt for
6593 info_for_reduction to work. */
6594 if (STMT_VINFO_LIVE_P (vdef))
6595 STMT_VINFO_REDUC_DEF (def) = phi_info;
6596 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6597 if (!assign)
6599 if (dump_enabled_p ())
6600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6601 "reduction chain includes calls.\n");
6602 return false;
6604 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6606 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6607 TREE_TYPE (gimple_assign_rhs1 (assign))))
6609 if (dump_enabled_p ())
6610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6611 "conversion in the reduction chain.\n");
6612 return false;
6615 else if (!stmt_info)
6616 /* First non-conversion stmt. */
6617 stmt_info = vdef;
6618 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6619 reduc_chain_length++;
6620 if (!stmt_info && slp_node)
6621 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6623 /* PHIs should not participate in patterns. */
6624 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6626 if (nested_in_vect_loop_p (loop, stmt_info))
6628 loop = loop->inner;
6629 nested_cycle = true;
6632 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6633 element. */
6634 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6636 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6637 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6639 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6640 gcc_assert (slp_node
6641 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6643 /* 1. Is vectorizable reduction? */
6644 /* Not supportable if the reduction variable is used in the loop, unless
6645 it's a reduction chain. */
6646 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6647 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6648 return false;
6650 /* Reductions that are not used even in an enclosing outer-loop,
6651 are expected to be "live" (used out of the loop). */
6652 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6653 && !STMT_VINFO_LIVE_P (stmt_info))
6654 return false;
6656 /* 2. Has this been recognized as a reduction pattern?
6658 Check if STMT represents a pattern that has been recognized
6659 in earlier analysis stages. For stmts that represent a pattern,
6660 the STMT_VINFO_RELATED_STMT field records the last stmt in
6661 the original sequence that constitutes the pattern. */
6663 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6664 if (orig_stmt_info)
6666 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6667 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6670 /* 3. Check the operands of the operation. The first operands are defined
6671 inside the loop body. The last operand is the reduction variable,
6672 which is defined by the loop-header-phi. */
6674 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6675 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6676 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6677 enum tree_code code = gimple_assign_rhs_code (stmt);
6678 bool lane_reduc_code_p
6679 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
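   /* For illustration: a lane-reducing reduction such as DOT_PROD_EXPR
      typically comes from source along the lines of

        signed char a[N], b[N];
        int sum = 0;
        for (int i = 0; i < N; i++)
          sum += a[i] * b[i];

      where a single vector operation folds several narrow input lanes into
      fewer (wider) accumulator lanes.  */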
6680 int op_type = TREE_CODE_LENGTH (code);
6681 enum optab_subtype optab_query_kind = optab_vector;
6682 if (code == DOT_PROD_EXPR
6683 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6684 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6685 optab_query_kind = optab_vector_mixed_sign;
6688 scalar_dest = gimple_assign_lhs (stmt);
6689 scalar_type = TREE_TYPE (scalar_dest);
6690 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6691 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6692 return false;
6694 /* Do not try to vectorize bit-precision reductions. */
6695 if (!type_has_mode_precision_p (scalar_type))
6696 return false;
6698 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6699 which means the only use of the PHI may be in the lane-reducing operation. */
6700 if (lane_reduc_code_p
6701 && reduc_chain_length != 1
6702 && !only_slp_reduc_chain)
6704 if (dump_enabled_p ())
6705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706 "lane-reducing reduction with extra stmts.\n");
6707 return false;
6710 /* All uses but the last are expected to be defined in the loop.
6711 The last use is the reduction variable. In case of nested cycle this
6712 assumption is not true: we use reduc_index to record the index of the
6713 reduction variable. */
6714 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6715 /* We need to skip an extra operand for COND_EXPRs with embedded
6716 comparison. */
6717 unsigned opno_adjust = 0;
6718 if (code == COND_EXPR
6719 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6720 opno_adjust = 1;
6721 for (i = 0; i < op_type; i++)
6723 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6724 if (i == 0 && code == COND_EXPR)
6725 continue;
6727 stmt_vec_info def_stmt_info;
6728 enum vect_def_type dt;
6729 tree op;
6730 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6731 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6732 &def_stmt_info))
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "use not simple.\n");
6737 return false;
6739 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6740 continue;
6742 /* There should be only one cycle def in the stmt, the one
6743 leading to reduc_def. */
6744 if (VECTORIZABLE_CYCLE_DEF (dt))
6745 return false;
6747 /* To properly compute ncopies we are interested in the widest
6748 non-reduction input type in case we're looking at a widening
6749 accumulation that we later handle in vect_transform_reduction. */
6750 if (lane_reduc_code_p
6751 && tem
6752 && (!vectype_in
6753 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6754 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6755 vectype_in = tem;
6757 if (code == COND_EXPR)
6759 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6760 if (dt == vect_constant_def)
6762 cond_reduc_dt = dt;
6763 cond_reduc_val = op;
6765 if (dt == vect_induction_def
6766 && def_stmt_info
6767 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6769 cond_reduc_dt = dt;
6770 cond_stmt_vinfo = def_stmt_info;
6774 if (!vectype_in)
6775 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6776 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6778 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6779 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
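   /* For illustration: a condition reduction is, roughly, a loop of the form

        int last = -1;
        for (int i = 0; i < N; i++)
          if (a[i] < b[i])
            last = c[i];

      where the value carried across iterations is only updated under a
      condition.  */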
6780 /* If we have a condition reduction, see if we can simplify it further. */
6781 if (v_reduc_type == COND_REDUCTION)
6783 if (slp_node)
6784 return false;
6786 /* When the condition uses the reduction value in the condition, fail. */
6787 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6789 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 "condition depends on previous iteration\n");
6792 return false;
6795 if (reduc_chain_length == 1
6796 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6797 vectype_in, OPTIMIZE_FOR_SPEED))
6799 if (dump_enabled_p ())
6800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6801 "optimizing condition reduction with"
6802 " FOLD_EXTRACT_LAST.\n");
6803 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6805 else if (cond_reduc_dt == vect_induction_def)
6807 tree base
6808 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6809 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6811 gcc_assert (TREE_CODE (base) == INTEGER_CST
6812 && TREE_CODE (step) == INTEGER_CST);
6813 cond_reduc_val = NULL_TREE;
6814 enum tree_code cond_reduc_op_code = ERROR_MARK;
6815 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6816 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6818 /* Find a suitable value: one below base for MAX_EXPR, one above base
6819 for MIN_EXPR; punt for now if base is the minimum value of the type
6820 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6821 else if (tree_int_cst_sgn (step) == -1)
6823 cond_reduc_op_code = MIN_EXPR;
6824 if (tree_int_cst_sgn (base) == -1)
6825 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6826 else if (tree_int_cst_lt (base,
6827 TYPE_MAX_VALUE (TREE_TYPE (base))))
6828 cond_reduc_val
6829 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6831 else
6833 cond_reduc_op_code = MAX_EXPR;
6834 if (tree_int_cst_sgn (base) == 1)
6835 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6836 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6837 base))
6838 cond_reduc_val
6839 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6841 if (cond_reduc_val)
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_NOTE, vect_location,
6845 "condition expression based on "
6846 "integer induction.\n");
6847 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6848 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6849 = cond_reduc_val;
6850 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6853 else if (cond_reduc_dt == vect_constant_def)
6855 enum vect_def_type cond_initial_dt;
6856 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6857 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6858 if (cond_initial_dt == vect_constant_def
6859 && types_compatible_p (TREE_TYPE (cond_initial_val),
6860 TREE_TYPE (cond_reduc_val)))
6862 tree e = fold_binary (LE_EXPR, boolean_type_node,
6863 cond_initial_val, cond_reduc_val);
6864 if (e && (integer_onep (e) || integer_zerop (e)))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_NOTE, vect_location,
6868 "condition expression based on "
6869 "compile time constant.\n");
6870 /* Record reduction code at analysis stage. */
6871 STMT_VINFO_REDUC_CODE (reduc_info)
6872 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6873 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6879 if (STMT_VINFO_LIVE_P (phi_info))
6880 return false;
6882 if (slp_node)
6883 ncopies = 1;
6884 else
6885 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6887 gcc_assert (ncopies >= 1);
6889 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6891 if (nested_cycle)
6893 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6894 == vect_double_reduction_def);
6895 double_reduc = true;
6898 /* 4.2. Check support for the epilog operation.
6900 If STMT represents a reduction pattern, then the type of the
6901 reduction variable may be different than the type of the rest
6902 of the arguments. For example, consider the case of accumulation
6903 of shorts into an int accumulator. The original code:
6904 S1: int_a = (int) short_a;
6905 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6907 was replaced with:
6908 STMT: int_acc = widen_sum <short_a, int_acc>
6910 This means that:
6911 1. The tree-code that is used to create the vector operation in the
6912 epilog code (that reduces the partial results) is not the
6913 tree-code of STMT, but is rather the tree-code of the original
6914 stmt from the pattern that STMT is replacing. I.e., in the example
6915 above we want to use 'widen_sum' in the loop, but 'plus' in the
6916 epilog.
6917 2. The type (mode) we use to check available target support
6918 for the vector operation to be created in the *epilog*, is
6919 determined by the type of the reduction variable (in the example
6920 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6921 However the type (mode) we use to check available target support
6922 for the vector operation to be created *inside the loop*, is
6923 determined by the type of the other arguments to STMT (in the
6924 example we'd check this: optab_handler (widen_sum_optab,
6925 vect_short_mode)).
6927 This is contrary to "regular" reductions, in which the types of all
6928 the arguments are the same as the type of the reduction variable.
6929 For "regular" reductions we can therefore use the same vector type
6930 (and also the same tree-code) when generating the epilog code and
6931 when generating the code inside the loop. */
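   /* Concretely, the widen_sum example above corresponds to source such as

        short s[N];
        int sum = 0;
        for (int i = 0; i < N; i++)
          sum += s[i];

      where the loop body uses a widening sum on vectors of shorts while the
      epilog reduces a single vector of ints with plain addition.  */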
6933 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6934 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6936 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6937 if (reduction_type == TREE_CODE_REDUCTION)
6939 /* Check whether it's ok to change the order of the computation.
6940 Generally, when vectorizing a reduction we change the order of the
6941 computation. This may change the behavior of the program in some
6942 cases, so we need to check that this is ok. One exception is when
6943 vectorizing an outer-loop: the inner-loop is executed sequentially,
6944 and therefore vectorizing reductions in the inner-loop during
6945 outer-loop vectorization is safe. Likewise when we are vectorizing
6946 a series of reductions using SLP and the VF is one the reductions
6947 are performed in scalar order. */
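      /* For instance, vectorizing

           double s = 0.0;
           for (int i = 0; i < N; i++)
             s += a[i];

         with VF 4 keeps four partial sums that are only combined after the
         loop, which reassociates the additions and can change the rounded
         result; needs_fold_left_reduction_p detects such cases.  */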
6948 if (slp_node
6949 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6950 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6952 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6954 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6955 is not directly used in stmt. */
6956 if (!only_slp_reduc_chain
6957 && reduc_chain_length != 1)
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6961 "in-order reduction chain without SLP.\n");
6962 return false;
6964 STMT_VINFO_REDUC_TYPE (reduc_info)
6965 = reduction_type = FOLD_LEFT_REDUCTION;
6967 else if (!commutative_tree_code (orig_code)
6968 || !associative_tree_code (orig_code))
6970 if (dump_enabled_p ())
6971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6972 "reduction: not commutative/associative");
6973 return false;
6977 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6978 && ncopies > 1)
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 "multiple types in double reduction or condition "
6983 "reduction or fold-left reduction.\n");
6984 return false;
6987 internal_fn reduc_fn = IFN_LAST;
6988 if (reduction_type == TREE_CODE_REDUCTION
6989 || reduction_type == FOLD_LEFT_REDUCTION
6990 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6991 || reduction_type == CONST_COND_REDUCTION)
6993 if (reduction_type == FOLD_LEFT_REDUCTION
6994 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6995 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6997 if (reduc_fn != IFN_LAST
6998 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6999 OPTIMIZE_FOR_SPEED))
7001 if (dump_enabled_p ())
7002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7003 "reduc op not supported by target.\n");
7005 reduc_fn = IFN_LAST;
7008 else
7010 if (!nested_cycle || double_reduc)
7012 if (dump_enabled_p ())
7013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7014 "no reduc code for scalar code.\n");
7016 return false;
7020 else if (reduction_type == COND_REDUCTION)
7022 int scalar_precision
7023 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7024 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7025 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7026 vectype_out);
7028 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7029 OPTIMIZE_FOR_SPEED))
7030 reduc_fn = IFN_REDUC_MAX;
7032 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7034 if (reduction_type != EXTRACT_LAST_REDUCTION
7035 && (!nested_cycle || double_reduc)
7036 && reduc_fn == IFN_LAST
7037 && !nunits_out.is_constant ())
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041 "missing target support for reduction on"
7042 " variable-length vectors.\n");
7043 return false;
7046 /* For SLP reductions, see if there is a neutral value we can use. */
7047 tree neutral_op = NULL_TREE;
7048 if (slp_node)
7050 tree initial_value = NULL_TREE;
7051 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7052 initial_value = vect_phi_initial_value (reduc_def_phi);
7053 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7054 orig_code, initial_value);
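   /* The neutral value is the identity element of the reduction operation:
      e.g. 0 for PLUS_EXPR, 1 for MULT_EXPR, all-ones for BIT_AND_EXPR,
      0 for BIT_IOR_EXPR and BIT_XOR_EXPR; for MIN_EXPR and MAX_EXPR the
      initial value itself is used.  */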
7057 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7059 /* We can't support in-order reductions of code such as this:
7061 for (int i = 0; i < n1; ++i)
7062 for (int j = 0; j < n2; ++j)
7063 l += a[j];
7065 since GCC effectively transforms the loop when vectorizing:
7067 for (int i = 0; i < n1 / VF; ++i)
7068 for (int j = 0; j < n2; ++j)
7069 for (int k = 0; k < VF; ++k)
7070 l += a[j];
7072 which is a reassociation of the original operation. */
7073 if (dump_enabled_p ())
7074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7075 "in-order double reduction not supported.\n");
7077 return false;
7080 if (reduction_type == FOLD_LEFT_REDUCTION
7081 && slp_node
7082 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7084 /* We cannot use in-order reductions in this case because there is
7085 an implicit reassociation of the operations involved. */
7086 if (dump_enabled_p ())
7087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7088 "in-order unchained SLP reductions not supported.\n");
7089 return false;
7092 /* For double reductions, and for SLP reductions with a neutral value,
7093 we construct a variable-length initial vector by loading a vector
7094 full of the neutral value and then shift-and-inserting the start
7095 values into the low-numbered elements. */
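   /* E.g. for a sum with start value s and a variable number of lanes the
      initial vector is built roughly as { s, 0, 0, ... }: a splat of the
      neutral value 0 followed by a shift-and-insert of s into lane 0.  */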
7096 if ((double_reduc || neutral_op)
7097 && !nunits_out.is_constant ()
7098 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7099 vectype_out, OPTIMIZE_FOR_SPEED))
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 "reduction on variable-length vectors requires"
7104 " target support for a vector-shift-and-insert"
7105 " operation.\n");
7106 return false;
7109 /* Check extra constraints for variable-length unchained SLP reductions. */
7110 if (STMT_SLP_TYPE (stmt_info)
7111 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7112 && !nunits_out.is_constant ())
7114 /* We checked above that we could build the initial vector when
7115 there's a neutral element value. Check here for the case in
7116 which each SLP statement has its own initial value and in which
7117 that value needs to be repeated for every instance of the
7118 statement within the initial vector. */
7119 unsigned int group_size = SLP_TREE_LANES (slp_node);
7120 if (!neutral_op
7121 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7122 TREE_TYPE (vectype_out)))
7124 if (dump_enabled_p ())
7125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7126 "unsupported form of SLP reduction for"
7127 " variable-length vectors: cannot build"
7128 " initial vector.\n");
7129 return false;
7131 /* The epilogue code relies on the number of elements being a multiple
7132 of the group size. The duplicate-and-interleave approach to setting
7133 up the initial vector does too. */
7134 if (!multiple_p (nunits_out, group_size))
7136 if (dump_enabled_p ())
7137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7138 "unsupported form of SLP reduction for"
7139 " variable-length vectors: the vector size"
7140 " is not a multiple of the number of results.\n");
7141 return false;
7145 if (reduction_type == COND_REDUCTION)
7147 widest_int ni;
7149 if (! max_loop_iterations (loop, &ni))
7151 if (dump_enabled_p ())
7152 dump_printf_loc (MSG_NOTE, vect_location,
7153 "loop count not known, cannot create cond "
7154 "reduction.\n");
7155 return false;
7157 /* Convert backedges to iterations. */
7158 ni += 1;
7160 /* The additional index will be the same type as the condition. Check
7161 that the loop can fit into this less one (because we'll use up the
7162 zero slot for when there are no matches). */
7163 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7164 if (wi::geu_p (ni, wi::to_widest (max_index)))
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "loop size is greater than data size.\n");
7169 return false;
7173 /* In case the vectorization factor (VF) is bigger than the number
7174 of elements that we can fit in a vectype (nunits), we have to generate
7175 more than one vector stmt - i.e., we need to "unroll" the
7176 vector stmt by a factor VF/nunits. For more details see documentation
7177 in vectorizable_operation. */
7179 /* If the reduction is used in an outer loop we need to generate
7180 VF intermediate results, like so (e.g. for ncopies=2):
7181 r0 = phi (init, r0)
7182 r1 = phi (init, r1)
7183 r0 = x0 + r0;
7184 r1 = x1 + r1;
7185 (i.e. we generate VF results in 2 registers).
7186 In this case we have a separate def-use cycle for each copy, and therefore
7187 for each copy we get the vector def for the reduction variable from the
7188 respective phi node created for this copy.
7190 Otherwise (the reduction is unused in the loop nest), we can combine
7191 together intermediate results, like so (e.g. for ncopies=2):
7192 r = phi (init, r)
7193 r = x0 + r;
7194 r = x1 + r;
7195 (i.e. we generate VF/2 results in a single register).
7196 In this case for each copy we get the vector def for the reduction variable
7197 from the vectorized reduction operation generated in the previous iteration.
7199 This only works when we see both the reduction PHI and its only consumer
7200 in vectorizable_reduction and there are no intermediate stmts
7201 participating. */
7202 if (ncopies > 1
7203 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7204 && reduc_chain_length == 1)
7205 single_defuse_cycle = true;
7207 if (single_defuse_cycle || lane_reduc_code_p)
7209 gcc_assert (code != COND_EXPR);
7211 /* 4. Supportable by target? */
7212 bool ok = true;
7214 /* 4.1. check support for the operation in the loop */
7215 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7216 if (!optab)
7218 if (dump_enabled_p ())
7219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7220 "no optab.\n");
7221 ok = false;
7224 machine_mode vec_mode = TYPE_MODE (vectype_in);
7225 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7227 if (dump_enabled_p ())
7228 dump_printf (MSG_NOTE, "op not supported by target.\n");
7229 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7230 || !vect_can_vectorize_without_simd_p (code))
7231 ok = false;
7232 else
7233 if (dump_enabled_p ())
7234 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7237 if (vect_emulated_vector_p (vectype_in)
7238 && !vect_can_vectorize_without_simd_p (code))
7240 if (dump_enabled_p ())
7241 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7242 return false;
7245 /* lane-reducing operations have to go through vect_transform_reduction.
7246 For the other cases try without the single cycle optimization. */
7247 if (!ok)
7249 if (lane_reduc_code_p)
7250 return false;
7251 else
7252 single_defuse_cycle = false;
7255 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7257 /* If the reduction stmt is one of the patterns that have a lane
7258 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7259 if ((ncopies > 1 && ! single_defuse_cycle)
7260 && lane_reduc_code_p)
7262 if (dump_enabled_p ())
7263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7264 "multi def-use cycle not possible for lane-reducing "
7265 "reduction operation\n");
7266 return false;
7269 if (slp_node
7270 && !(!single_defuse_cycle
7271 && code != DOT_PROD_EXPR
7272 && code != WIDEN_SUM_EXPR
7273 && code != SAD_EXPR
7274 && reduction_type != FOLD_LEFT_REDUCTION))
7275 for (i = 0; i < op_type; i++)
7276 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7278 if (dump_enabled_p ())
7279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7280 "incompatible vector types for invariants\n");
7281 return false;
7284 if (slp_node)
7285 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7286 else
7287 vec_num = 1;
7289 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7290 reduction_type, ncopies, cost_vec);
7291 /* Cost the reduction op inside the loop if transformed via
7292 vect_transform_reduction. Otherwise this is costed by the
7293 separate vectorizable_* routines. */
7294 if (single_defuse_cycle
7295 || code == DOT_PROD_EXPR
7296 || code == WIDEN_SUM_EXPR
7297 || code == SAD_EXPR)
7298 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7300 if (dump_enabled_p ()
7301 && reduction_type == FOLD_LEFT_REDUCTION)
7302 dump_printf_loc (MSG_NOTE, vect_location,
7303 "using an in-order (fold-left) reduction.\n");
7304 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7305 /* All reductions except single def-use-cycle optimized, lane-reducing
7306 and fold-left ones go through their own vectorizable_* routines. */
7307 if (!single_defuse_cycle
7308 && code != DOT_PROD_EXPR
7309 && code != WIDEN_SUM_EXPR
7310 && code != SAD_EXPR
7311 && reduction_type != FOLD_LEFT_REDUCTION)
7313 stmt_vec_info tem
7314 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7315 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7317 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7318 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7320 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7321 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7323 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7325 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7326 internal_fn cond_fn = get_conditional_internal_fn (code);
7328 if (reduction_type != FOLD_LEFT_REDUCTION
7329 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7330 && (cond_fn == IFN_LAST
7331 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7332 OPTIMIZE_FOR_SPEED)))
7334 if (dump_enabled_p ())
7335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7336 "can't operate on partial vectors because"
7337 " no conditional operation is available.\n");
7338 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7340 else if (reduction_type == FOLD_LEFT_REDUCTION
7341 && reduc_fn == IFN_LAST
7342 && !expand_vec_cond_expr_p (vectype_in,
7343 truth_type_for (vectype_in),
7344 SSA_NAME))
7346 if (dump_enabled_p ())
7347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7348 "can't operate on partial vectors because"
7349 " no conditional operation is available.\n");
7350 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7352 else
7353 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7354 vectype_in, NULL);
7356 return true;
7359 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7360 value. */
7362 bool
7363 vect_transform_reduction (loop_vec_info loop_vinfo,
7364 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7365 gimple **vec_stmt, slp_tree slp_node)
7367 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7368 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7369 int i;
7370 int ncopies;
7371 int vec_num;
7373 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7374 gcc_assert (reduc_info->is_reduc_info);
7376 if (nested_in_vect_loop_p (loop, stmt_info))
7378 loop = loop->inner;
7379 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7382 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7383 enum tree_code code = gimple_assign_rhs_code (stmt);
7384 int op_type = TREE_CODE_LENGTH (code);
7386 /* Flatten RHS. */
7387 tree ops[3];
7388 switch (get_gimple_rhs_class (code))
7390 case GIMPLE_TERNARY_RHS:
7391 ops[2] = gimple_assign_rhs3 (stmt);
7392 /* Fall thru. */
7393 case GIMPLE_BINARY_RHS:
7394 ops[0] = gimple_assign_rhs1 (stmt);
7395 ops[1] = gimple_assign_rhs2 (stmt);
7396 break;
7397 default:
7398 gcc_unreachable ();
7401 /* All uses but the last are expected to be defined in the loop.
7402 The last use is the reduction variable. In case of nested cycle this
7403 assumption is not true: we use reduc_index to record the index of the
7404 reduction variable. */
7405 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7406 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7407 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7408 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7410 if (slp_node)
7412 ncopies = 1;
7413 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7415 else
7417 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7418 vec_num = 1;
7421 internal_fn cond_fn = get_conditional_internal_fn (code);
7422 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7423 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7425 /* Transform. */
7426 tree new_temp = NULL_TREE;
7427 auto_vec<tree> vec_oprnds0;
7428 auto_vec<tree> vec_oprnds1;
7429 auto_vec<tree> vec_oprnds2;
7430 tree def0;
7432 if (dump_enabled_p ())
7433 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7435 /* FORNOW: Multiple types are not supported for condition. */
7436 if (code == COND_EXPR)
7437 gcc_assert (ncopies == 1);
7439 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7441 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
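   /* A fold-left (in-order) reduction accumulates the elements of each
      vector strictly in their original order, preserving the scalar
      evaluation order (and hence FP rounding) at the cost of a serial
      chain of operations.  */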
7442 if (reduction_type == FOLD_LEFT_REDUCTION)
7444 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7445 return vectorize_fold_left_reduction
7446 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7447 reduc_fn, ops, vectype_in, reduc_index, masks);
7450 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7451 gcc_assert (single_defuse_cycle
7452 || code == DOT_PROD_EXPR
7453 || code == WIDEN_SUM_EXPR
7454 || code == SAD_EXPR);
7456 /* Create the destination vector */
7457 tree scalar_dest = gimple_assign_lhs (stmt);
7458 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7460 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7461 single_defuse_cycle && reduc_index == 0
7462 ? NULL_TREE : ops[0], &vec_oprnds0,
7463 single_defuse_cycle && reduc_index == 1
7464 ? NULL_TREE : ops[1], &vec_oprnds1,
7465 op_type == ternary_op
7466 && !(single_defuse_cycle && reduc_index == 2)
7467 ? ops[2] : NULL_TREE, &vec_oprnds2);
7468 if (single_defuse_cycle)
7470 gcc_assert (!slp_node);
7471 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7472 ops[reduc_index],
7473 reduc_index == 0 ? &vec_oprnds0
7474 : (reduc_index == 1 ? &vec_oprnds1
7475 : &vec_oprnds2));
7478 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7480 gimple *new_stmt;
7481 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
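      /* In the fully-masked case without mask_by_cond_expr the statement is
         emitted as a conditional internal function call
         COND_<OP> (mask, vop[0], vop[1], vop[0]), which computes
         mask ? vop[0] <op> vop[1] : vop[0] per lane, so inactive lanes
         simply keep the accumulator value.  */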
7482 if (masked_loop_p && !mask_by_cond_expr)
7484 /* Make sure that the reduction accumulator is vop[0]. */
7485 if (reduc_index == 1)
7487 gcc_assert (commutative_tree_code (code));
7488 std::swap (vop[0], vop[1]);
7490 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7491 vectype_in, i);
7492 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7493 vop[0], vop[1], vop[0]);
7494 new_temp = make_ssa_name (vec_dest, call);
7495 gimple_call_set_lhs (call, new_temp);
7496 gimple_call_set_nothrow (call, true);
7497 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7498 new_stmt = call;
7500 else
7502 if (op_type == ternary_op)
7503 vop[2] = vec_oprnds2[i];
7505 if (masked_loop_p && mask_by_cond_expr)
7507 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7508 vectype_in, i);
7509 build_vect_cond_expr (code, vop, mask, gsi);
7512 new_stmt = gimple_build_assign (vec_dest, code,
7513 vop[0], vop[1], vop[2]);
7514 new_temp = make_ssa_name (vec_dest, new_stmt);
7515 gimple_assign_set_lhs (new_stmt, new_temp);
7516 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7519 if (slp_node)
7520 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7521 else if (single_defuse_cycle
7522 && i < ncopies - 1)
7524 if (reduc_index == 0)
7525 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7526 else if (reduc_index == 1)
7527 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7528 else if (reduc_index == 2)
7529 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7531 else
7532 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7535 if (!slp_node)
7536 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7538 return true;
7541 /* Transform phase of a cycle PHI. */
7543 bool
7544 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7545 stmt_vec_info stmt_info, gimple **vec_stmt,
7546 slp_tree slp_node, slp_instance slp_node_instance)
7548 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7549 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7550 int i;
7551 int ncopies;
7552 int j;
7553 bool nested_cycle = false;
7554 int vec_num;
7556 if (nested_in_vect_loop_p (loop, stmt_info))
7558 loop = loop->inner;
7559 nested_cycle = true;
7562 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7563 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7564 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7565 gcc_assert (reduc_info->is_reduc_info);
7567 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7568 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7569 /* Leave the scalar phi in place. */
7570 return true;
7572 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7573 /* For a nested cycle we do not fill the above. */
7574 if (!vectype_in)
7575 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7576 gcc_assert (vectype_in);
7578 if (slp_node)
7580 /* The size vect_schedule_slp_instance computes is off for us. */
7581 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7582 * SLP_TREE_LANES (slp_node), vectype_in);
7583 ncopies = 1;
7585 else
7587 vec_num = 1;
7588 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7591 /* Check whether we should use a single PHI node and accumulate
7592 vectors to one before the backedge. */
7593 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7594 ncopies = 1;
7596 /* Create the destination vector */
7597 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7598 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7599 vectype_out);
7601 /* Get the loop-entry arguments. */
7602 tree vec_initial_def = NULL_TREE;
7603 auto_vec<tree> vec_initial_defs;
7604 if (slp_node)
7606 vec_initial_defs.reserve (vec_num);
7607 if (nested_cycle)
7609 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7610 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7611 &vec_initial_defs);
7613 else
7615 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7616 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7617 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7619 unsigned int num_phis = stmts.length ();
7620 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7621 num_phis = 1;
7622 initial_values.reserve (num_phis);
7623 for (unsigned int i = 0; i < num_phis; ++i)
7625 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7626 initial_values.quick_push (vect_phi_initial_value (this_phi));
7628 if (vec_num == 1)
7629 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7630 if (!initial_values.is_empty ())
7632 tree initial_value
7633 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7634 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7635 tree neutral_op
7636 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7637 code, initial_value);
7638 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7639 &vec_initial_defs, vec_num,
7640 stmts.length (), neutral_op);
7644 else
7646 /* Get at the scalar def before the loop, that defines the initial
7647 value of the reduction variable. */
7648 tree initial_def = vect_phi_initial_value (phi);
7649 reduc_info->reduc_initial_values.safe_push (initial_def);
7650 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7651 and we can't use zero for induc_val, use initial_def. Similarly
7652 for REDUC_MIN and initial_def larger than the base. */
7653 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7655 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7656 if (TREE_CODE (initial_def) == INTEGER_CST
7657 && !integer_zerop (induc_val)
7658 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7659 && tree_int_cst_lt (initial_def, induc_val))
7660 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7661 && tree_int_cst_lt (induc_val, initial_def))))
7663 induc_val = initial_def;
7664 /* Communicate to epilogue generation that we used
7665 the initial_def. */
7666 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7668 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7670 else if (nested_cycle)
7672 /* Do not use an adjustment def as that case is not supported
7673 correctly if ncopies is not one. */
7674 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7675 ncopies, initial_def,
7676 &vec_initial_defs);
7678 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7679 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7680 /* Fill the initial vector with the initial scalar value. */
7681 vec_initial_def
7682 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7683 initial_def, initial_def);
7684 else
7686 if (ncopies == 1)
7687 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7688 if (!reduc_info->reduc_initial_values.is_empty ())
7690 initial_def = reduc_info->reduc_initial_values[0];
7691 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7692 tree neutral_op
7693 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7694 code, initial_def);
7695 gcc_assert (neutral_op);
7696 /* Try to simplify the vector initialization by applying an
7697 adjustment after the reduction has been performed. */
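         /* E.g. for

              int sum = 5;
              for (int i = 0; i < N; i++)
                sum += a[i];

            the vector accumulator can start from the neutral value 0, with
            the initial 5 added back once after the epilog reduction via
            STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT.  */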
7698 if (!reduc_info->reused_accumulator
7699 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7700 && !operand_equal_p (neutral_op, initial_def))
7702 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7703 = initial_def;
7704 initial_def = neutral_op;
7706 vec_initial_def
7707 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7708 initial_def, neutral_op);
7713 if (vec_initial_def)
7715 vec_initial_defs.create (ncopies);
7716 for (i = 0; i < ncopies; ++i)
7717 vec_initial_defs.quick_push (vec_initial_def);
7720 if (auto *accumulator = reduc_info->reused_accumulator)
7722 tree def = accumulator->reduc_input;
7723 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7725 unsigned int nreduc;
7726 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7727 (TREE_TYPE (def)),
7728 TYPE_VECTOR_SUBPARTS (vectype_out),
7729 &nreduc);
7730 gcc_assert (res);
7731 gimple_seq stmts = NULL;
7732 /* Reduce the single vector to a smaller one. */
7733 if (nreduc != 1)
7735 /* Perform the reduction in the appropriate type. */
7736 tree rvectype = vectype_out;
7737 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7738 TREE_TYPE (TREE_TYPE (def))))
7739 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7740 TYPE_VECTOR_SUBPARTS
7741 (vectype_out));
7742 def = vect_create_partial_epilog (def, rvectype,
7743 STMT_VINFO_REDUC_CODE
7744 (reduc_info),
7745 &stmts);
7747 /* Adjust the input so we pick up the partially reduced value
7748 for the skip edge in vect_create_epilog_for_reduction. */
7749 accumulator->reduc_input = def;
7750 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7751 def = gimple_convert (&stmts, vectype_out, def);
7752 if (loop_vinfo->main_loop_edge)
7754 /* While we'd like to insert on the edge, this would split
7755 blocks and disturb bookkeeping; we will also eventually
7756 need this on the skip edge. Rely on sinking to fix up
7757 the optimal placement and insert in the predecessor. */
7758 gimple_stmt_iterator gsi
7759 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7760 /* Insert before a cond that eventually skips the
7761 epilogue. */
7762 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7763 gsi_prev (&gsi);
7764 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7766 else
7767 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7768 stmts);
7770 if (loop_vinfo->main_loop_edge)
7771 vec_initial_defs[0]
7772 = vect_get_main_loop_result (loop_vinfo, def,
7773 vec_initial_defs[0]);
7774 else
7775 vec_initial_defs.safe_push (def);
7778 /* Generate the reduction PHIs upfront. */
7779 for (i = 0; i < vec_num; i++)
7781 tree vec_init_def = vec_initial_defs[i];
7782 for (j = 0; j < ncopies; j++)
7784 /* Create the reduction-phi that defines the reduction
7785 operand. */
7786 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7788 /* Set the loop-entry arg of the reduction-phi. */
7789 if (j != 0 && nested_cycle)
7790 vec_init_def = vec_initial_defs[j];
7791 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7792 UNKNOWN_LOCATION);
7794 /* The loop-latch arg is set in epilogue processing. */
7796 if (slp_node)
7797 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7798 else
7800 if (j == 0)
7801 *vec_stmt = new_phi;
7802 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7807 return true;
7810 /* Vectorizes LC PHIs. */
7812 bool
7813 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7814 stmt_vec_info stmt_info, gimple **vec_stmt,
7815 slp_tree slp_node)
7817 if (!loop_vinfo
7818 || !is_a <gphi *> (stmt_info->stmt)
7819 || gimple_phi_num_args (stmt_info->stmt) != 1)
7820 return false;
7822 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7823 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7824 return false;
7826 if (!vec_stmt) /* transformation not required. */
7828 /* Deal with copies from externs or constants that disguise as
7829 loop-closed PHI nodes (PR97886). */
7830 if (slp_node
7831 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7832 SLP_TREE_VECTYPE (slp_node)))
7834 if (dump_enabled_p ())
7835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7836 "incompatible vector types for invariants\n");
7837 return false;
7839 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7840 return true;
7843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7844 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7845 basic_block bb = gimple_bb (stmt_info->stmt);
7846 edge e = single_pred_edge (bb);
7847 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7848 auto_vec<tree> vec_oprnds;
7849 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7850 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7851 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7852 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7854 /* Create the vectorized LC PHI node. */
7855 gphi *new_phi = create_phi_node (vec_dest, bb);
7856 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7857 if (slp_node)
7858 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7859 else
7860 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7862 if (!slp_node)
7863 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7865 return true;
7868 /* Vectorizes PHIs. */
7870 bool
7871 vectorizable_phi (vec_info *,
7872 stmt_vec_info stmt_info, gimple **vec_stmt,
7873 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7875 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7876 return false;
7878 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7879 return false;
7881 tree vectype = SLP_TREE_VECTYPE (slp_node);
7883 if (!vec_stmt) /* transformation not required. */
7885 slp_tree child;
7886 unsigned i;
7887 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7888 if (!child)
7890 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7892 "PHI node with unvectorized backedge def\n");
7893 return false;
7895 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "incompatible vector types for invariants\n");
7900 return false;
7902 /* For single-argument PHIs assume coalescing which means zero cost
7903 for the scalar and the vector PHIs. This avoids artificially
7904 favoring the vector path (but may pessimize it in some cases). */
7905 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7906 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7907 vector_stmt, stmt_info, vectype, 0, vect_body);
7908 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7909 return true;
7912 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7913 basic_block bb = gimple_bb (stmt_info->stmt);
7914 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7915 auto_vec<gphi *> new_phis;
7916 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7918 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7920 /* Skip not yet vectorized defs. */
7921 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7922 && SLP_TREE_VEC_STMTS (child).is_empty ())
7923 continue;
7925 auto_vec<tree> vec_oprnds;
7926 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7927 if (!new_phis.exists ())
7929 new_phis.create (vec_oprnds.length ());
7930 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7932 /* Create the vectorized LC PHI node. */
7933 new_phis.quick_push (create_phi_node (vec_dest, bb));
7934 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7937 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7938 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7939 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7941 /* We should have at least one already vectorized child. */
7942 gcc_assert (new_phis.exists ());
7944 return true;
7947 /* Return true if VECTYPE represents a vector that requires lowering
7948 by the vector lowering pass. */
7950 bool
7951 vect_emulated_vector_p (tree vectype)
7953 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7954 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7955 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7958 /* Return true if we can emulate CODE on an integer mode representation
7959 of a vector. */
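/* For example, a bitwise AND of two vectors that fit in a machine word can
   be carried out as a single scalar AND on that word; additions can likewise
   be emulated with word-sized bit-twiddling (SWAR) tricks, which is why the
   codes below can be handled without native SIMD support.  */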
7961 bool
7962 vect_can_vectorize_without_simd_p (tree_code code)
7964 switch (code)
7966 case PLUS_EXPR:
7967 case MINUS_EXPR:
7968 case NEGATE_EXPR:
7969 case BIT_AND_EXPR:
7970 case BIT_IOR_EXPR:
7971 case BIT_XOR_EXPR:
7972 case BIT_NOT_EXPR:
7973 return true;
7975 default:
7976 return false;
7980 /* Function vectorizable_induction
7982 Check if STMT_INFO performs an induction computation that can be vectorized.
7983 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7984 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7985 Return true if STMT_INFO is vectorizable in this way. */
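/* For example (a sketch): given

     for (i = 0; i < n; i++)
       { a[i] = j; j += 2; }

   and VF = 4, the induction PHI for j is replaced by a vector PHI with
   initial value {j, j+2, j+4, j+6} and a loop-body update that adds the
   step vector {8, 8, 8, 8}.  */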
7987 bool
7988 vectorizable_induction (loop_vec_info loop_vinfo,
7989 stmt_vec_info stmt_info,
7990 gimple **vec_stmt, slp_tree slp_node,
7991 stmt_vector_for_cost *cost_vec)
7993 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7994 unsigned ncopies;
7995 bool nested_in_vect_loop = false;
7996 class loop *iv_loop;
7997 tree vec_def;
7998 edge pe = loop_preheader_edge (loop);
7999 basic_block new_bb;
8000 tree new_vec, vec_init, vec_step, t;
8001 tree new_name;
8002 gimple *new_stmt;
8003 gphi *induction_phi;
8004 tree induc_def, vec_dest;
8005 tree init_expr, step_expr;
8006 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8007 unsigned i;
8008 tree expr;
8009 gimple_stmt_iterator si;
8011 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8012 if (!phi)
8013 return false;
8015 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8016 return false;
8018 /* Make sure it was recognized as induction computation. */
8019 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8020 return false;
8022 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8023 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8025 if (slp_node)
8026 ncopies = 1;
8027 else
8028 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8029 gcc_assert (ncopies >= 1);
8031 /* FORNOW. These restrictions should be relaxed. */
8032 if (nested_in_vect_loop_p (loop, stmt_info))
8034 imm_use_iterator imm_iter;
8035 use_operand_p use_p;
8036 gimple *exit_phi;
8037 edge latch_e;
8038 tree loop_arg;
8040 if (ncopies > 1)
8042 if (dump_enabled_p ())
8043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8044 "multiple types in nested loop.\n");
8045 return false;
8048 exit_phi = NULL;
8049 latch_e = loop_latch_edge (loop->inner);
8050 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8051 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8053 gimple *use_stmt = USE_STMT (use_p);
8054 if (is_gimple_debug (use_stmt))
8055 continue;
8057 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8059 exit_phi = use_stmt;
8060 break;
8063 if (exit_phi)
8065 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8066 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8067 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071 "inner-loop induction only used outside "
8072 "of the outer vectorized loop.\n");
8073 return false;
8077 nested_in_vect_loop = true;
8078 iv_loop = loop->inner;
8080 else
8081 iv_loop = loop;
8082 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8084 if (slp_node && !nunits.is_constant ())
8086 /* The current SLP code creates the step value element-by-element. */
8087 if (dump_enabled_p ())
8088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8089 "SLP induction not supported for variable-length"
8090 " vectors.\n");
8091 return false;
8094 if (!vec_stmt) /* transformation not required. */
8096 unsigned inside_cost = 0, prologue_cost = 0;
8097 if (slp_node)
8099 /* We eventually need to set a vector type on invariant
8100 arguments. */
8101 unsigned j;
8102 slp_tree child;
8103 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8104 if (!vect_maybe_update_slp_op_vectype
8105 (child, SLP_TREE_VECTYPE (slp_node)))
8107 if (dump_enabled_p ())
8108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8109 "incompatible vector types for "
8110 "invariants\n");
8111 return false;
8113 /* loop cost for vec_loop. */
8114 inside_cost
8115 = record_stmt_cost (cost_vec,
8116 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8117 vector_stmt, stmt_info, 0, vect_body);
8118 /* prologue cost for vec_init (if not nested) and step. */
8119 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8120 scalar_to_vec,
8121 stmt_info, 0, vect_prologue);
8123 else /* if (!slp_node) */
8125 /* loop cost for vec_loop. */
8126 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8127 stmt_info, 0, vect_body);
8128 /* prologue cost for vec_init and vec_step. */
8129 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8130 stmt_info, 0, vect_prologue);
8132 if (dump_enabled_p ())
8133 dump_printf_loc (MSG_NOTE, vect_location,
8134 "vect_model_induction_cost: inside_cost = %d, "
8135 "prologue_cost = %d .\n", inside_cost,
8136 prologue_cost);
8138 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8139 DUMP_VECT_SCOPE ("vectorizable_induction");
8140 return true;
8143 /* Transform. */
8145 /* Compute a vector variable, initialized with the first VF values of
8146 the induction variable. E.g., for an iv with IV_PHI='X' and
8147 evolution S, for a vector of 4 units, we want to compute:
8148 [X, X + S, X + 2*S, X + 3*S]. */
8150 if (dump_enabled_p ())
8151 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8153 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8154 gcc_assert (step_expr != NULL_TREE);
8155 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8157 pe = loop_preheader_edge (iv_loop);
8158 /* Find the first insertion point in the BB. */
8159 basic_block bb = gimple_bb (phi);
8160 si = gsi_after_labels (bb);
8162 /* For SLP induction we have to generate several IVs as for example
8163 with group size 3 we need
8164 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8165 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8166 if (slp_node)
8168 /* Enforced above. */
8169 unsigned int const_nunits = nunits.to_constant ();
8171 /* The initial values are vectorized, but any lanes > group_size
8172 need adjustment. */
8173 slp_tree init_node
8174 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8176 /* Gather steps. Since we do not vectorize inductions as
8177 cycles we have to reconstruct the step from SCEV data. */
8178 unsigned group_size = SLP_TREE_LANES (slp_node);
8179 tree *steps = XALLOCAVEC (tree, group_size);
8180 tree *inits = XALLOCAVEC (tree, group_size);
8181 stmt_vec_info phi_info;
8182 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8184 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8185 if (!init_node)
8186 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8187 pe->dest_idx);
8190 /* Now generate the IVs. */
8191 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8192 gcc_assert ((const_nunits * nvects) % group_size == 0);
8193 unsigned nivs;
8194 if (nested_in_vect_loop)
8195 nivs = nvects;
8196 else
8198 /* Compute the number of distinct IVs we need. First reduce
8199 group_size if it is a multiple of const_nunits so we get
8200 one IV for a group_size of 4 but const_nunits 2. */
8201 unsigned group_sizep = group_size;
8202 if (group_sizep % const_nunits == 0)
8203 group_sizep = group_sizep / const_nunits;
8204 nivs = least_common_multiple (group_sizep,
8205 const_nunits) / const_nunits;
8207 tree stept = TREE_TYPE (step_vectype);
8208 tree lupdate_mul = NULL_TREE;
8209 if (!nested_in_vect_loop)
8211 /* The number of iterations covered in one vector iteration. */
8212 unsigned lup_mul = (nvects * const_nunits) / group_size;
8213 lupdate_mul
8214 = build_vector_from_val (step_vectype,
8215 SCALAR_FLOAT_TYPE_P (stept)
8216 ? build_real_from_wide (stept, lup_mul,
8217 UNSIGNED)
8218 : build_int_cstu (stept, lup_mul));
8220 tree peel_mul = NULL_TREE;
8221 gimple_seq init_stmts = NULL;
8222 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8224 if (SCALAR_FLOAT_TYPE_P (stept))
8225 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8226 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8227 else
8228 peel_mul = gimple_convert (&init_stmts, stept,
8229 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8230 peel_mul = gimple_build_vector_from_val (&init_stmts,
8231 step_vectype, peel_mul);
8233 unsigned ivn;
8234 auto_vec<tree> vec_steps;
8235 for (ivn = 0; ivn < nivs; ++ivn)
8237 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8238 tree_vector_builder init_elts (vectype, const_nunits, 1);
8239 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8240 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8242 /* The scalar steps of the IVs. */
8243 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8244 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8245 step_elts.quick_push (elt);
8246 if (!init_node)
8248 /* The scalar inits of the IVs if not vectorized. */
8249 elt = inits[(ivn*const_nunits + eltn) % group_size];
8250 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8251 TREE_TYPE (elt)))
8252 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8253 TREE_TYPE (vectype), elt);
8254 init_elts.quick_push (elt);
8256 /* The number of steps to add to the initial values. */
8257 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8258 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8259 ? build_real_from_wide (stept,
8260 mul_elt, UNSIGNED)
8261 : build_int_cstu (stept, mul_elt));
8263 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8264 vec_steps.safe_push (vec_step);
8265 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8266 if (peel_mul)
8267 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8268 step_mul, peel_mul);
8269 if (!init_node)
8270 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8272 /* Create the induction-phi that defines the induction-operand. */
8273 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8274 "vec_iv_");
8275 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8276 induc_def = PHI_RESULT (induction_phi);
8278 /* Create the iv update inside the loop */
8279 tree up = vec_step;
8280 if (lupdate_mul)
8281 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8282 vec_step, lupdate_mul);
8283 gimple_seq stmts = NULL;
8284 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8285 vec_def = gimple_build (&stmts,
8286 PLUS_EXPR, step_vectype, vec_def, up);
8287 vec_def = gimple_convert (&stmts, vectype, vec_def);
8288 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8289 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8290 UNKNOWN_LOCATION);
8292 if (init_node)
8293 vec_init = vect_get_slp_vect_def (init_node, ivn);
8294 if (!nested_in_vect_loop
8295 && !integer_zerop (step_mul))
8297 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8298 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8299 vec_step, step_mul);
8300 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8301 vec_def, up);
8302 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8305 /* Set the arguments of the phi node: */
8306 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8308 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8310 if (!nested_in_vect_loop)
8312 /* Fill up to the number of vectors we need for the whole group. */
8313 nivs = least_common_multiple (group_size,
8314 const_nunits) / const_nunits;
8315 vec_steps.reserve (nivs-ivn);
8316 for (; ivn < nivs; ++ivn)
8318 SLP_TREE_VEC_STMTS (slp_node)
8319 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8320 vec_steps.quick_push (vec_steps[0]);
8324 /* Re-use IVs when we can. We are generating further vector
8325 stmts by adding VF' * stride to the IVs generated above. */
8326 if (ivn < nvects)
8328 unsigned vfp
8329 = least_common_multiple (group_size, const_nunits) / group_size;
8330 tree lupdate_mul
8331 = build_vector_from_val (step_vectype,
8332 SCALAR_FLOAT_TYPE_P (stept)
8333 ? build_real_from_wide (stept,
8334 vfp, UNSIGNED)
8335 : build_int_cstu (stept, vfp));
8336 for (; ivn < nvects; ++ivn)
8338 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8339 tree def = gimple_get_lhs (iv);
8340 if (ivn < 2*nivs)
8341 vec_steps[ivn - nivs]
8342 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8343 vec_steps[ivn - nivs], lupdate_mul);
8344 gimple_seq stmts = NULL;
8345 def = gimple_convert (&stmts, step_vectype, def);
8346 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8347 def, vec_steps[ivn % nivs]);
8348 def = gimple_convert (&stmts, vectype, def);
8349 if (gimple_code (iv) == GIMPLE_PHI)
8350 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8351 else
8353 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8354 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8356 SLP_TREE_VEC_STMTS (slp_node)
8357 .quick_push (SSA_NAME_DEF_STMT (def));
8361 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8362 gcc_assert (!new_bb);
8364 return true;
8367 init_expr = vect_phi_initial_value (phi);
8369 gimple_seq stmts = NULL;
8370 if (!nested_in_vect_loop)
8372 /* Convert the initial value to the IV update type. */
8373 tree new_type = TREE_TYPE (step_expr);
8374 init_expr = gimple_convert (&stmts, new_type, init_expr);
8376 /* If we are using the loop mask to "peel" for alignment then we need
8377 to adjust the start value here. */
8378 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8379 if (skip_niters != NULL_TREE)
8381 if (FLOAT_TYPE_P (vectype))
8382 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8383 skip_niters);
8384 else
8385 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8386 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8387 skip_niters, step_expr);
8388 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8389 init_expr, skip_step);
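/* Illustrative sketch (assuming, for example, init_expr == 10,
   step_expr == 2 and skip_niters == 3): the start value becomes
   10 - 3*2 = 4, so once the first-iteration mask disables the three
   "peeled" lanes, the first active lane still sees the original
   initial value 10.  */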
8393 if (stmts)
8395 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8396 gcc_assert (!new_bb);
8399 /* Create the vector that holds the initial_value of the induction. */
8400 if (nested_in_vect_loop)
8402 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8403 been created during vectorization of previous stmts. We obtain it
8404 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8405 auto_vec<tree> vec_inits;
8406 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8407 init_expr, &vec_inits);
8408 vec_init = vec_inits[0];
8409 /* If the initial value is not of the proper type, convert it. */
8410 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8412 new_stmt
8413 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8414 vect_simple_var,
8415 "vec_iv_"),
8416 VIEW_CONVERT_EXPR,
8417 build1 (VIEW_CONVERT_EXPR, vectype,
8418 vec_init));
8419 vec_init = gimple_assign_lhs (new_stmt);
8420 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8421 new_stmt);
8422 gcc_assert (!new_bb);
8425 else
8427 /* iv_loop is the loop to be vectorized. Create:
8428 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8429 stmts = NULL;
8430 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8432 unsigned HOST_WIDE_INT const_nunits;
8433 if (nunits.is_constant (&const_nunits))
8435 tree_vector_builder elts (step_vectype, const_nunits, 1);
8436 elts.quick_push (new_name);
8437 for (i = 1; i < const_nunits; i++)
8439 /* Create: new_name_i = new_name + step_expr */
8440 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8441 new_name, step_expr);
8442 elts.quick_push (new_name);
8444 /* Create a vector from [new_name_0, new_name_1, ...,
8445 new_name_nunits-1] */
8446 vec_init = gimple_build_vector (&stmts, &elts);
8448 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8449 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8450 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8451 new_name, step_expr);
8452 else
8454 /* Build:
8455 [base, base, base, ...]
8456 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8457 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8458 gcc_assert (flag_associative_math);
8459 tree index = build_index_vector (step_vectype, 0, 1);
8460 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8461 new_name);
8462 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8463 step_expr);
8464 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8465 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8466 vec_init, step_vec);
8467 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8468 vec_init, base_vec);
8470 vec_init = gimple_convert (&stmts, vectype, vec_init);
8472 if (stmts)
8474 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8475 gcc_assert (!new_bb);
8480 /* Create the vector that holds the step of the induction. */
8481 if (nested_in_vect_loop)
8482 /* iv_loop is nested in the loop to be vectorized. Generate:
8483 vec_step = [S, S, S, S] */
8484 new_name = step_expr;
8485 else
8487 /* iv_loop is the loop to be vectorized. Generate:
8488 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8489 gimple_seq seq = NULL;
8490 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8492 expr = build_int_cst (integer_type_node, vf);
8493 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8495 else
8496 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8497 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8498 expr, step_expr);
8499 if (seq)
8501 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8502 gcc_assert (!new_bb);
8506 t = unshare_expr (new_name);
8507 gcc_assert (CONSTANT_CLASS_P (new_name)
8508 || TREE_CODE (new_name) == SSA_NAME);
8509 new_vec = build_vector_from_val (step_vectype, t);
8510 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8511 new_vec, step_vectype, NULL);
8514 /* Create the following def-use cycle:
8515 loop prolog:
8516 vec_init = ...
8517 vec_step = ...
8518 loop:
8519 vec_iv = PHI <vec_init, vec_loop>
8521 STMT
8523 vec_loop = vec_iv + vec_step; */
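/* Illustrative sketch of the cycle above (assuming a simple integer IV
   with X = init_expr = 0, S = step_expr = 1 and VF = 4):
     loop prolog:
       vec_init = { 0, 1, 2, 3 }
       vec_step = { 4, 4, 4, 4 }
     loop:
       vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
       ...
       vec_loop = vec_iv + vec_step;  */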
8525 /* Create the induction-phi that defines the induction-operand. */
8526 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8527 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8528 induc_def = PHI_RESULT (induction_phi);
8530 /* Create the iv update inside the loop */
8531 stmts = NULL;
8532 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8533 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8534 vec_def = gimple_convert (&stmts, vectype, vec_def);
8535 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8536 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8538 /* Set the arguments of the phi node: */
8539 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8540 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8541 UNKNOWN_LOCATION);
8543 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8544 *vec_stmt = induction_phi;
8546 /* In case the vectorization factor (VF) is bigger than the number
8547 of elements that we can fit in a vectype (nunits), we have to generate
8548 more than one vector stmt - i.e. - we need to "unroll" the
8549 vector stmt by a factor of VF/nunits. For more details see the
8550 documentation in vectorizable_operation. */
8552 if (ncopies > 1)
8554 gimple_seq seq = NULL;
8555 /* FORNOW. This restriction should be relaxed. */
8556 gcc_assert (!nested_in_vect_loop);
8558 /* Create the vector that holds the step of the induction. */
8559 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8561 expr = build_int_cst (integer_type_node, nunits);
8562 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8564 else
8565 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8566 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8567 expr, step_expr);
8568 if (seq)
8570 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8571 gcc_assert (!new_bb);
8574 t = unshare_expr (new_name);
8575 gcc_assert (CONSTANT_CLASS_P (new_name)
8576 || TREE_CODE (new_name) == SSA_NAME);
8577 new_vec = build_vector_from_val (step_vectype, t);
8578 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8579 new_vec, step_vectype, NULL);
8581 vec_def = induc_def;
8582 for (i = 1; i < ncopies; i++)
8584 /* vec_i = vec_prev + vec_step */
8585 gimple_seq stmts = NULL;
8586 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8587 vec_def = gimple_build (&stmts,
8588 PLUS_EXPR, step_vectype, vec_def, vec_step);
8589 vec_def = gimple_convert (&stmts, vectype, vec_def);
8591 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8592 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8593 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8597 if (dump_enabled_p ())
8598 dump_printf_loc (MSG_NOTE, vect_location,
8599 "transform induction: created def-use cycle: %G%G",
8600 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8602 return true;
8605 /* Function vectorizable_live_operation.
8607 STMT_INFO computes a value that is used outside the loop. Check if
8608 it can be supported. */
8610 bool
8611 vectorizable_live_operation (vec_info *vinfo,
8612 stmt_vec_info stmt_info,
8613 gimple_stmt_iterator *gsi,
8614 slp_tree slp_node, slp_instance slp_node_instance,
8615 int slp_index, bool vec_stmt_p,
8616 stmt_vector_for_cost *cost_vec)
8618 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8619 imm_use_iterator imm_iter;
8620 tree lhs, lhs_type, bitsize;
8621 tree vectype = (slp_node
8622 ? SLP_TREE_VECTYPE (slp_node)
8623 : STMT_VINFO_VECTYPE (stmt_info));
8624 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8625 int ncopies;
8626 gimple *use_stmt;
8627 auto_vec<tree> vec_oprnds;
8628 int vec_entry = 0;
8629 poly_uint64 vec_index = 0;
8631 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8633 /* If a stmt of a reduction is live, vectorize it via
8634 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8635 validity so just trigger the transform here. */
8636 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8638 if (!vec_stmt_p)
8639 return true;
8640 if (slp_node)
8642 /* For reduction chains the meta-info is attached to
8643 the group leader. */
8644 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8645 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8646 /* For SLP reductions we vectorize the epilogue for
8647 all involved stmts together. */
8648 else if (slp_index != 0)
8649 return true;
8650 else
8651 /* For SLP reductions the meta-info is attached to
8652 the representative. */
8653 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8655 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8656 gcc_assert (reduc_info->is_reduc_info);
8657 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8658 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8659 return true;
8660 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8661 slp_node_instance);
8662 return true;
8665 /* If STMT is not relevant and it is a simple assignment and its inputs are
8666 invariant then it can remain in place, unvectorized. The original last
8667 scalar value that it computes will be used. */
8668 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8670 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8671 if (dump_enabled_p ())
8672 dump_printf_loc (MSG_NOTE, vect_location,
8673 "statement is simple and uses invariant. Leaving in "
8674 "place.\n");
8675 return true;
8678 if (slp_node)
8679 ncopies = 1;
8680 else
8681 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8683 if (slp_node)
8685 gcc_assert (slp_index >= 0);
8687 /* Get the last occurrence of the scalar index from the concatenation of
8688 all the slp vectors. Calculate which slp vector it is and the index
8689 within. */
8690 int num_scalar = SLP_TREE_LANES (slp_node);
8691 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8692 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
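/* Illustrative sketch (assuming num_scalar == 2, num_vec == 2 and
   nunits == 4): the two lanes repeat as 0,1,0,1,0,1,0,1 across the
   8 vector elements, so for slp_index == 0 the last occurrence is at
   pos = 2*4 - 2 + 0 = 6, giving vec_entry == 1 and vec_index == 2.  */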
8694 /* Calculate which vector contains the result, and which lane of
8695 that vector we need. */
8696 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8698 if (dump_enabled_p ())
8699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8700 "Cannot determine which vector holds the"
8701 " final result.\n");
8702 return false;
8706 if (!vec_stmt_p)
8708 /* No transformation required. */
8709 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8711 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8712 OPTIMIZE_FOR_SPEED))
8714 if (dump_enabled_p ())
8715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8716 "can't operate on partial vectors "
8717 "because the target doesn't support extract "
8718 "last reduction.\n");
8719 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8721 else if (slp_node)
8723 if (dump_enabled_p ())
8724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8725 "can't operate on partial vectors "
8726 "because an SLP statement is live after "
8727 "the loop.\n");
8728 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8730 else if (ncopies > 1)
8732 if (dump_enabled_p ())
8733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8734 "can't operate on partial vectors "
8735 "because ncopies is greater than 1.\n");
8736 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8738 else
8740 gcc_assert (ncopies == 1 && !slp_node);
8741 vect_record_loop_mask (loop_vinfo,
8742 &LOOP_VINFO_MASKS (loop_vinfo),
8743 1, vectype, NULL);
8746 /* ??? Enable for loop costing as well. */
8747 if (!loop_vinfo)
8748 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8749 0, vect_epilogue);
8750 return true;
8753 /* Use the lhs of the original scalar statement. */
8754 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8755 if (dump_enabled_p ())
8756 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8757 "stmt %G", stmt);
8759 lhs = gimple_get_lhs (stmt);
8760 lhs_type = TREE_TYPE (lhs);
8762 bitsize = vector_element_bits_tree (vectype);
8764 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8765 tree vec_lhs, bitstart;
8766 gimple *vec_stmt;
8767 if (slp_node)
8769 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8771 /* Get the correct slp vectorized stmt. */
8772 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8773 vec_lhs = gimple_get_lhs (vec_stmt);
8775 /* Get entry to use. */
8776 bitstart = bitsize_int (vec_index);
8777 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8779 else
8781 /* For multiple copies, get the last copy. */
8782 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8783 vec_lhs = gimple_get_lhs (vec_stmt);
8785 /* Get the last lane in the vector. */
8786 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8789 if (loop_vinfo)
8791 /* To ensure that the VEC_LHS for lane-extraction stmts satisfies the
8792 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8793 loop;
8795 # lhs' = PHI <lhs>
8797 loop;
8799 # vec_lhs' = PHI <vec_lhs>
8800 new_tree = lane_extract <vec_lhs', ...>;
8801 lhs' = new_tree; */
8803 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8804 basic_block exit_bb = single_exit (loop)->dest;
8805 gcc_assert (single_pred_p (exit_bb));
8807 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8808 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8809 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8811 gimple_seq stmts = NULL;
8812 tree new_tree;
8813 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8815 /* Emit:
8817 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8819 where VEC_LHS is the vectorized live-out result and MASK is
8820 the loop mask for the final iteration. */
8821 gcc_assert (ncopies == 1 && !slp_node);
8822 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8823 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8824 1, vectype, 0);
8825 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8826 mask, vec_lhs_phi);
8828 /* Convert the extracted vector element to the scalar type. */
8829 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8831 else
8833 tree bftype = TREE_TYPE (vectype);
8834 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8835 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8836 new_tree = build3 (BIT_FIELD_REF, bftype,
8837 vec_lhs_phi, bitsize, bitstart);
8838 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8839 &stmts, true, NULL_TREE);
8842 if (stmts)
8844 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8845 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8847 /* Remove the existing PHI fed by LHS and create a copy from NEW_TREE instead. */
8848 tree lhs_phi = NULL_TREE;
8849 gimple_stmt_iterator gsi;
8850 for (gsi = gsi_start_phis (exit_bb);
8851 !gsi_end_p (gsi); gsi_next (&gsi))
8853 gimple *phi = gsi_stmt (gsi);
8854 if ((gimple_phi_arg_def (phi, 0) == lhs))
8856 remove_phi_node (&gsi, false);
8857 lhs_phi = gimple_phi_result (phi);
8858 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8859 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8860 break;
8865 /* Replace uses of LHS with the newly computed result. If the use stmt is
8866 a single-argument PHI, just replace all uses of the PHI result. This is
8867 necessary because the LC SSA PHI defining LHS may precede the new stmt. */
8868 use_operand_p use_p;
8869 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8870 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8871 && !is_gimple_debug (use_stmt))
8873 if (gimple_code (use_stmt) == GIMPLE_PHI
8874 && gimple_phi_num_args (use_stmt) == 1)
8876 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8878 else
8880 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8881 SET_USE (use_p, new_tree);
8883 update_stmt (use_stmt);
8886 else
8888 /* For basic-block vectorization simply insert the lane-extraction. */
8889 tree bftype = TREE_TYPE (vectype);
8890 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8891 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8892 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8893 vec_lhs, bitsize, bitstart);
8894 gimple_seq stmts = NULL;
8895 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8896 &stmts, true, NULL_TREE);
8897 if (TREE_CODE (new_tree) == SSA_NAME
8898 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8899 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8900 if (is_a <gphi *> (vec_stmt))
8902 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8903 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8905 else
8907 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8908 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8911 /* Replace uses of LHS with the newly computed result. If the use stmt is
8912 a single-argument PHI, just replace all uses of the PHI result. This is
8913 necessary because the LC SSA PHI defining LHS may precede the new stmt. */
8914 use_operand_p use_p;
8915 stmt_vec_info use_stmt_info;
8916 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8917 if (!is_gimple_debug (use_stmt)
8918 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8919 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8921 /* ??? This can happen when the live lane ends up being
8922 used in a vector construction code-generated by an
8923 external SLP node (and code-generation for that already
8924 happened). See gcc.dg/vect/bb-slp-47.c.
8925 Doing this is what would happen if that vector CTOR
8926 were not code-generated yet so it is not too bad.
8927 ??? In fact we'd likely want to avoid this situation
8928 in the first place. */
8929 if (TREE_CODE (new_tree) == SSA_NAME
8930 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8931 && gimple_code (use_stmt) != GIMPLE_PHI
8932 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8933 use_stmt))
8935 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8936 gcc_assert (code == CONSTRUCTOR
8937 || code == VIEW_CONVERT_EXPR
8938 || CONVERT_EXPR_CODE_P (code));
8939 if (dump_enabled_p ())
8940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8941 "Using original scalar computation for "
8942 "live lane because use preceeds vector "
8943 "def\n");
8944 continue;
8946 /* ??? It can also happen that we end up pulling a def into
8947 a loop where replacing out-of-loop uses would require
8948 a new LC SSA PHI node. Retain the original scalar in
8949 those cases as well. PR98064. */
8950 if (TREE_CODE (new_tree) == SSA_NAME
8951 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8952 && (gimple_bb (use_stmt)->loop_father
8953 != gimple_bb (vec_stmt)->loop_father)
8954 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8955 gimple_bb (use_stmt)->loop_father))
8957 if (dump_enabled_p ())
8958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8959 "Using original scalar computation for "
8960 "live lane because there is an out-of-loop "
8961 "definition for it\n");
8962 continue;
8964 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8965 SET_USE (use_p, new_tree);
8966 update_stmt (use_stmt);
8970 return true;
8973 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8975 static void
8976 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8978 ssa_op_iter op_iter;
8979 imm_use_iterator imm_iter;
8980 def_operand_p def_p;
8981 gimple *ustmt;
8983 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8985 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8987 basic_block bb;
8989 if (!is_gimple_debug (ustmt))
8990 continue;
8992 bb = gimple_bb (ustmt);
8994 if (!flow_bb_inside_loop_p (loop, bb))
8996 if (gimple_debug_bind_p (ustmt))
8998 if (dump_enabled_p ())
8999 dump_printf_loc (MSG_NOTE, vect_location,
9000 "killing debug use\n");
9002 gimple_debug_bind_reset_value (ustmt);
9003 update_stmt (ustmt);
9005 else
9006 gcc_unreachable ();
9012 /* Given loop represented by LOOP_VINFO, return true if computation of
9013 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9014 otherwise. */
9016 static bool
9017 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9019 /* Constant case. */
9020 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9022 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9023 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9025 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9026 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9027 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9028 return true;
9031 widest_int max;
9032 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9033 /* Check the upper bound of loop niters. */
9034 if (get_max_loop_iterations (loop, &max))
9036 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9037 signop sgn = TYPE_SIGN (type);
9038 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9039 if (max < type_max)
9040 return true;
9042 return false;
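/* Illustrative sketch of the overflow this predicate guards against
   (assuming a 32-bit unsigned niters type): if NITERSM1 == 0xffffffff
   then NITERS = NITERSM1 + 1 wraps to 0, so the constant case requires
   NITERSM1 < NITERS and the symbolic case requires the loop's maximum
   iteration count to be strictly below the type's maximum value.  */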
9045 /* Return a mask type with half the number of elements as OLD_TYPE,
9046 given that it should have mode NEW_MODE. */
9048 tree
9049 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9051 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9052 return build_truth_vector_type_for_mode (nunits, new_mode);
9055 /* Return a mask type with twice as many elements as OLD_TYPE,
9056 given that it should have mode NEW_MODE. */
9058 tree
9059 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9061 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9062 return build_truth_vector_type_for_mode (nunits, new_mode);
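/* For example (an illustrative sketch, target permitting): halving a
   16-element mask type yields an 8-element mask type in NEW_MODE, while
   doubling an 8-element mask type yields a 16-element one; the caller is
   responsible for picking a NEW_MODE that can represent that many
   boolean elements.  */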
9065 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9066 contain a sequence of NVECTORS masks that each control a vector of type
9067 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9068 these vector masks with the vector version of SCALAR_MASK. */
9070 void
9071 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9072 unsigned int nvectors, tree vectype, tree scalar_mask)
9074 gcc_assert (nvectors != 0);
9075 if (masks->length () < nvectors)
9076 masks->safe_grow_cleared (nvectors, true);
9077 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9078 /* The number of scalars per iteration and the number of vectors are
9079 both compile-time constants. */
9080 unsigned int nscalars_per_iter
9081 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9082 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9084 if (scalar_mask)
9086 scalar_cond_masked_key cond (scalar_mask, nvectors);
9087 loop_vinfo->scalar_cond_masked_set.add (cond);
9090 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9092 rgm->max_nscalars_per_iter = nscalars_per_iter;
9093 rgm->type = truth_type_for (vectype);
9094 rgm->factor = 1;
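/* Worked example (an illustrative sketch): with a vectorization factor
   of 16 and VECTYPE V8HI (8 subparts), recording NVECTORS == 2 masks
   gives nscalars_per_iter = 2 * 8 / 16 = 1 in rgroup (*masks)[1]; a
   later request with a larger nscalars_per_iter for the same NVECTORS
   simply raises max_nscalars_per_iter, as the code above does.  */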
9098 /* Given a complete set of masks MASKS, extract mask number INDEX
9099 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9100 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9102 See the comment above vec_loop_masks for more details about the mask
9103 arrangement. */
9105 tree
9106 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9107 unsigned int nvectors, tree vectype, unsigned int index)
9109 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9110 tree mask_type = rgm->type;
9112 /* Populate the rgroup's mask array, if this is the first time we've
9113 used it. */
9114 if (rgm->controls.is_empty ())
9116 rgm->controls.safe_grow_cleared (nvectors, true);
9117 for (unsigned int i = 0; i < nvectors; ++i)
9119 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9120 /* Provide a dummy definition until the real one is available. */
9121 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9122 rgm->controls[i] = mask;
9126 tree mask = rgm->controls[index];
9127 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9128 TYPE_VECTOR_SUBPARTS (vectype)))
9130 /* A loop mask for data type X can be reused for data type Y
9131 if X has N times more elements than Y and if Y's elements
9132 are N times bigger than X's. In this case each sequence
9133 of N elements in the loop mask will be all-zero or all-one.
9134 We can then view-convert the mask so that each sequence of
9135 N elements is replaced by a single element. */
9136 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9137 TYPE_VECTOR_SUBPARTS (vectype)));
9138 gimple_seq seq = NULL;
9139 mask_type = truth_type_for (vectype);
9140 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9141 if (seq)
9142 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9144 return mask;
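/* Illustrative sketch of the re-use case handled above (assuming the
   rgroup mask was created for 16 x QI data and is now requested for
   8 x HI data): every pair of mask elements is known to be all-zero or
   all-one, so the VIEW_CONVERT_EXPR to the 8-element mask type collapses
   each pair into a single element without changing which scalar
   iterations are active.  */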
9147 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9148 lengths for controlling an operation on VECTYPE. The operation splits
9149 each element of VECTYPE into FACTOR separate subelements, measuring the
9150 length as a number of these subelements. */
9152 void
9153 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9154 unsigned int nvectors, tree vectype, unsigned int factor)
9156 gcc_assert (nvectors != 0);
9157 if (lens->length () < nvectors)
9158 lens->safe_grow_cleared (nvectors, true);
9159 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9161 /* The number of scalars per iteration, the number of bytes occupied per
9162 scalar and the number of vectors are all compile-time constants. */
9163 unsigned int nscalars_per_iter
9164 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9165 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9167 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9169 /* For now, we only support cases in which all loads and stores fall back
9170 to VnQI or none do. */
9171 gcc_assert (!rgl->max_nscalars_per_iter
9172 || (rgl->factor == 1 && factor == 1)
9173 || (rgl->max_nscalars_per_iter * rgl->factor
9174 == nscalars_per_iter * factor));
9175 rgl->max_nscalars_per_iter = nscalars_per_iter;
9176 rgl->type = vectype;
9177 rgl->factor = factor;
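/* Illustrative sketch (assuming an operation on V8HI whose length is
   measured in byte-sized subelements): FACTOR == 2 because each HI
   element spans two subelements, and the assert above requires every
   user of the rgroup to agree on nscalars_per_iter * factor, i.e. on
   the number of subelements handled per scalar iteration.  */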
9181 /* Given a complete set of length LENS, extract length number INDEX for an
9182 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9184 tree
9185 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9186 unsigned int nvectors, unsigned int index)
9188 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9190 /* Populate the rgroup's len array, if this is the first time we've
9191 used it. */
9192 if (rgl->controls.is_empty ())
9194 rgl->controls.safe_grow_cleared (nvectors, true);
9195 for (unsigned int i = 0; i < nvectors; ++i)
9197 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9198 gcc_assert (len_type != NULL_TREE);
9199 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9201 /* Provide a dummy definition until the real one is available. */
9202 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9203 rgl->controls[i] = len;
9207 return rgl->controls[index];
9210 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
9211 based on its estimated number of iterations. */
9213 static void
9214 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9216 edge preheader = loop_preheader_edge (loop);
9217 /* Reduce loop iterations by the vectorization factor. */
9218 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9219 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9221 if (freq_h.nonzero_p ())
9223 profile_probability p;
9225 /* Avoid dropping loop body profile counter to 0 because of zero count
9226 in loop's preheader. */
9227 if (!(freq_e == profile_count::zero ()))
9228 freq_e = freq_e.force_nonzero ();
9229 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9230 scale_loop_frequencies (loop, p);
9233 edge exit_e = single_exit (loop);
9234 exit_e->probability = profile_probability::always ()
9235 .apply_scale (1, new_est_niter + 1);
9237 edge exit_l = single_pred_edge (loop->latch);
9238 profile_probability prob = exit_l->probability;
9239 exit_l->probability = exit_e->probability.invert ();
9240 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9241 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
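/* Worked example (an illustrative sketch): if the scalar loop was
   estimated to run about 40 iterations and VF == 4, new_est_niter is
   roughly 10; the body counts are then scaled so the header executes
   about (10 + 1) times per preheader entry, and the exit edge gets
   probability 1 / (10 + 1).  */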
9244 /* For a vectorized stmt DEF_STMT_INFO, adjust the latch edge values of all
9245 vectorized PHIs that were originally defined by it. */
9247 static void
9248 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9249 stmt_vec_info def_stmt_info)
9251 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9252 if (!def || TREE_CODE (def) != SSA_NAME)
9253 return;
9254 stmt_vec_info phi_info;
9255 imm_use_iterator iter;
9256 use_operand_p use_p;
9257 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9258 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9259 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9260 && (phi_info = loop_vinfo->lookup_stmt (phi))
9261 && STMT_VINFO_RELEVANT_P (phi_info)
9262 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9263 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9264 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9266 loop_p loop = gimple_bb (phi)->loop_father;
9267 edge e = loop_latch_edge (loop);
9268 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9270 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9271 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9272 gcc_assert (phi_defs.length () == latch_defs.length ());
9273 for (unsigned i = 0; i < phi_defs.length (); ++i)
9274 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9275 gimple_get_lhs (latch_defs[i]), e,
9276 gimple_phi_arg_location (phi, e->dest_idx));
9281 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9282 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9283 stmt_vec_info. */
9285 static bool
9286 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9287 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9289 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9290 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9292 if (dump_enabled_p ())
9293 dump_printf_loc (MSG_NOTE, vect_location,
9294 "------>vectorizing statement: %G", stmt_info->stmt);
9296 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9297 vect_loop_kill_debug_uses (loop, stmt_info);
9299 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9300 && !STMT_VINFO_LIVE_P (stmt_info))
9301 return false;
9303 if (STMT_VINFO_VECTYPE (stmt_info))
9305 poly_uint64 nunits
9306 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9307 if (!STMT_SLP_TYPE (stmt_info)
9308 && maybe_ne (nunits, vf)
9309 && dump_enabled_p ())
9310 /* For SLP, VF is set according to the unrolling factor and not
9311 to the vector size, hence for SLP this print is not valid. */
9312 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9315 /* Pure SLP statements have already been vectorized. We still need
9316 to apply loop vectorization to hybrid SLP statements. */
9317 if (PURE_SLP_STMT (stmt_info))
9318 return false;
9320 if (dump_enabled_p ())
9321 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9323 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9324 *seen_store = stmt_info;
9326 return true;
9329 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9330 that are in the hash_map with their corresponding values. */
9332 static tree
9333 find_in_mapping (tree t, void *context)
9335 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9337 tree *value = mapping->get (t);
9338 return value ? *value : t;
9341 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9342 original loop that has now been vectorized.
9344 The inits of the data_references need to be advanced by the number of
9345 iterations of the main loop. This has been computed in vect_do_peeling and
9346 is stored in parameter ADVANCE. We first restore the data_references'
9347 initial offsets with the values recorded in ORIG_DRS_INIT.
9349 Since the loop_vec_info of this EPILOGUE was constructed for the original
9350 loop, its stmt_vec_infos all point to the original statements. These need
9351 to be updated to point to their corresponding copies as well as the SSA_NAMES
9352 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9354 The data_references' connections also need to be updated: their
9355 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9356 stmt_vec_infos, their statements need to point to their corresponding copy,
9357 and if they are gather loads or scatter stores then their reference needs
9358 to be updated to point to its corresponding copy. Finally we set
9359 'base_misaligned' to false as we have already peeled for alignment in the
9360 prologue of the main loop. */
9362 static void
9363 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9365 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9366 auto_vec<gimple *> stmt_worklist;
9367 hash_map<tree,tree> mapping;
9368 gimple *orig_stmt, *new_stmt;
9369 gimple_stmt_iterator epilogue_gsi;
9370 gphi_iterator epilogue_phi_gsi;
9371 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9372 basic_block *epilogue_bbs = get_loop_body (epilogue);
9373 unsigned i;
9375 free (LOOP_VINFO_BBS (epilogue_vinfo));
9376 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9378 /* Advance data_reference's with the number of iterations of the previous
9379 loop and its prologue. */
9380 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9383 /* The EPILOGUE loop is a copy of the original loop so they share the same
9384 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9385 point to the copied statements. We also create a mapping from the LHSs in
9386 the original loop to the LHSs in the EPILOGUE and create worklists to
9387 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9388 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9390 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9391 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9393 new_stmt = epilogue_phi_gsi.phi ();
9395 gcc_assert (gimple_uid (new_stmt) > 0);
9396 stmt_vinfo
9397 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9399 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9400 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9402 mapping.put (gimple_phi_result (orig_stmt),
9403 gimple_phi_result (new_stmt));
9404 /* PHI nodes can not have patterns or related statements. */
9405 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9406 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9409 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9410 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9412 new_stmt = gsi_stmt (epilogue_gsi);
9413 if (is_gimple_debug (new_stmt))
9414 continue;
9416 gcc_assert (gimple_uid (new_stmt) > 0);
9417 stmt_vinfo
9418 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9420 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9421 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9423 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9424 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9426 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9428 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9429 for (gimple_stmt_iterator gsi = gsi_start (seq);
9430 !gsi_end_p (gsi); gsi_next (&gsi))
9431 stmt_worklist.safe_push (gsi_stmt (gsi));
9434 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9435 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9437 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9438 stmt_worklist.safe_push (stmt);
9439 /* Set BB such that the assert in
9440 'get_initial_def_for_reduction' is able to determine that
9441 the BB of the related stmt is inside this loop. */
9442 gimple_set_bb (stmt,
9443 gimple_bb (new_stmt));
9444 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9445 gcc_assert (related_vinfo == NULL
9446 || related_vinfo == stmt_vinfo);
9451 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9452 using the original main loop and thus need to be updated to refer to the
9453 cloned variables used in the epilogue. */
9454 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9456 gimple *stmt = stmt_worklist[i];
9457 tree *new_op;
9459 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9461 tree op = gimple_op (stmt, j);
9462 if ((new_op = mapping.get(op)))
9463 gimple_set_op (stmt, j, *new_op);
9464 else
9466 /* PR92429: The last argument of simplify_replace_tree disables
9467 folding when replacing arguments. This is required as
9468 otherwise you might end up with different statements than the
9469 ones analyzed in vect_loop_analyze, leading to different
9470 vectorization. */
9471 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9472 &find_in_mapping, &mapping, false);
9473 gimple_set_op (stmt, j, op);
9478 struct data_reference *dr;
9479 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9480 FOR_EACH_VEC_ELT (datarefs, i, dr)
9482 orig_stmt = DR_STMT (dr);
9483 gcc_assert (gimple_uid (orig_stmt) > 0);
9484 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9485 /* Data references for gather loads and scatter stores do not use the
9486 updated offset we set using ADVANCE. Instead we have to make sure the
9487 reference in each data reference points to the corresponding copy of
9488 the original in the epilogue. */
9489 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9490 == VMAT_GATHER_SCATTER)
9492 DR_REF (dr)
9493 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9494 &find_in_mapping, &mapping);
9495 DR_BASE_ADDRESS (dr)
9496 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9497 &find_in_mapping, &mapping);
9499 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9500 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9501 /* The vector size of the epilogue is smaller than that of the main loop,
9502 so the alignment requirement is either the same or lower. This means
9503 the DR will by definition be aligned. */
9504 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9507 epilogue_vinfo->shared->datarefs_copy.release ();
9508 epilogue_vinfo->shared->save_datarefs ();
9511 /* Function vect_transform_loop.
9513 The analysis phase has determined that the loop is vectorizable.
9514 Vectorize the loop - create vectorized stmts to replace the scalar
9515 stmts in the loop, and update the loop exit condition.
9516 Returns scalar epilogue loop if any. */
9518 class loop *
9519 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9521 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9522 class loop *epilogue = NULL;
9523 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9524 int nbbs = loop->num_nodes;
9525 int i;
9526 tree niters_vector = NULL_TREE;
9527 tree step_vector = NULL_TREE;
9528 tree niters_vector_mult_vf = NULL_TREE;
9529 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9530 unsigned int lowest_vf = constant_lower_bound (vf);
9531 gimple *stmt;
9532 bool check_profitability = false;
9533 unsigned int th;
9535 DUMP_VECT_SCOPE ("vec_transform_loop");
9537 loop_vinfo->shared->check_datarefs ();
9539 /* Use the more conservative vectorization threshold. If the number
9540 of iterations is constant, assume the cost check has been performed
9541 by our caller. If the threshold makes all loops profitable that
9542 run at least the (estimated) vectorization factor number of times,
9543 checking is pointless, too. */
9544 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9545 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9547 if (dump_enabled_p ())
9548 dump_printf_loc (MSG_NOTE, vect_location,
9549 "Profitability threshold is %d loop iterations.\n",
9550 th);
9551 check_profitability = true;
9554 /* Make sure there exists a single-predecessor exit bb. Do this before
9555 versioning. */
9556 edge e = single_exit (loop);
9557 if (! single_pred_p (e->dest))
9559 split_loop_exit_edge (e, true);
9560 if (dump_enabled_p ())
9561 dump_printf (MSG_NOTE, "split exit edge\n");
9564 /* Version the loop first, if required, so the profitability check
9565 comes first. */
9567 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9569 class loop *sloop
9570 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9571 sloop->force_vectorize = false;
9572 check_profitability = false;
9575 /* Make sure there exists a single-predecessor exit bb also on the
9576 scalar loop copy. Do this after versioning but before peeling
9577 so the CFG structure is fine for both the scalar and the if-converted
9578 loop, and slpeel_duplicate_current_defs_from_edges sees matched
9579 loop-closed PHI nodes on the exit. */
9580 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9582 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9583 if (! single_pred_p (e->dest))
9585 split_loop_exit_edge (e, true);
9586 if (dump_enabled_p ())
9587 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9591 tree niters = vect_build_loop_niters (loop_vinfo);
9592 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9593 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9594 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9595 tree advance;
9596 drs_init_vec orig_drs_init;
9598 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9599 &step_vector, &niters_vector_mult_vf, th,
9600 check_profitability, niters_no_overflow,
9601 &advance);
9603 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9604 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9605 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9606 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9608 if (niters_vector == NULL_TREE)
9610 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9611 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9612 && known_eq (lowest_vf, vf))
9614 niters_vector
9615 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9616 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9617 step_vector = build_one_cst (TREE_TYPE (niters));
9619 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9620 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9621 &step_vector, niters_no_overflow);
9622 else
9623 /* vect_do_peeling subtracted the number of peeled prologue
9624 iterations from LOOP_VINFO_NITERS. */
9625 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9626 &niters_vector, &step_vector,
9627 niters_no_overflow);
9630 /* 1) Make sure the loop header has exactly two entries
9631 2) Make sure we have a preheader basic block. */
9633 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9635 split_edge (loop_preheader_edge (loop));
9637 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9638 /* This will deal with any possible peeling. */
9639 vect_prepare_for_masked_peels (loop_vinfo);
9641 /* Schedule the SLP instances first, then handle loop vectorization
9642 below. */
9643 if (!loop_vinfo->slp_instances.is_empty ())
9645 DUMP_VECT_SCOPE ("scheduling SLP instances");
9646 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9649 /* FORNOW: the vectorizer supports only loops whose body consists
9650 of one basic block (header + empty latch). When the vectorizer
9651 supports more involved loop forms, the order by which the BBs are
9652 traversed needs to be reconsidered. */
9654 for (i = 0; i < nbbs; i++)
9656 basic_block bb = bbs[i];
9657 stmt_vec_info stmt_info;
9659 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9660 gsi_next (&si))
9662 gphi *phi = si.phi ();
9663 if (dump_enabled_p ())
9664 dump_printf_loc (MSG_NOTE, vect_location,
9665 "------>vectorizing phi: %G", phi);
9666 stmt_info = loop_vinfo->lookup_stmt (phi);
9667 if (!stmt_info)
9668 continue;
9670 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9671 vect_loop_kill_debug_uses (loop, stmt_info);
9673 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9674 && !STMT_VINFO_LIVE_P (stmt_info))
9675 continue;
9677 if (STMT_VINFO_VECTYPE (stmt_info)
9678 && (maybe_ne
9679 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9680 && dump_enabled_p ())
9681 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9683 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9684 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9685 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9686 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9687 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9688 && ! PURE_SLP_STMT (stmt_info))
9690 if (dump_enabled_p ())
9691 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9692 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9696 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9697 gsi_next (&si))
9699 gphi *phi = si.phi ();
9700 stmt_info = loop_vinfo->lookup_stmt (phi);
9701 if (!stmt_info)
9702 continue;
9704 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9705 && !STMT_VINFO_LIVE_P (stmt_info))
9706 continue;
9708 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9709 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9710 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9711 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9712 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9713 && ! PURE_SLP_STMT (stmt_info))
9714 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9717 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9718 !gsi_end_p (si);)
9720 stmt = gsi_stmt (si);
9721 /* During vectorization remove existing clobber stmts. */
9722 if (gimple_clobber_p (stmt))
9724 unlink_stmt_vdef (stmt);
9725 gsi_remove (&si, true);
9726 release_defs (stmt);
9728 else
9730 /* Ignore vector stmts created in the outer loop. */
9731 stmt_info = loop_vinfo->lookup_stmt (stmt);
9733 /* vector stmts created in the outer-loop during vectorization of
9734 stmts in an inner-loop may not have a stmt_info, and do not
9735 need to be vectorized. */
9736 stmt_vec_info seen_store = NULL;
9737 if (stmt_info)
9739 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9741 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9742 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9743 !gsi_end_p (subsi); gsi_next (&subsi))
9745 stmt_vec_info pat_stmt_info
9746 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9747 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9748 &si, &seen_store);
9750 stmt_vec_info pat_stmt_info
9751 = STMT_VINFO_RELATED_STMT (stmt_info);
9752 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9753 &si, &seen_store))
9754 maybe_set_vectorized_backedge_value (loop_vinfo,
9755 pat_stmt_info);
9757 else
9759 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9760 &seen_store))
9761 maybe_set_vectorized_backedge_value (loop_vinfo,
9762 stmt_info);
9765 gsi_next (&si);
9766 if (seen_store)
9768 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9769 /* Interleaving. If IS_STORE is TRUE, the
9770 vectorization of the interleaving chain was
9771 completed - free all the stores in the chain. */
9772 vect_remove_stores (loop_vinfo,
9773 DR_GROUP_FIRST_ELEMENT (seen_store));
9774 else
9775 /* Free the attached stmt_vec_info and remove the stmt. */
9776 loop_vinfo->remove_stmt (stmt_info);
9781 /* Stub out scalar statements that must not survive vectorization.
9782 Doing this here helps with grouped statements, or statements that
9783 are involved in patterns. */
9784 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9785 !gsi_end_p (gsi); gsi_next (&gsi))
9787 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9788 if (!call || !gimple_call_internal_p (call))
9789 continue;
9790 internal_fn ifn = gimple_call_internal_fn (call);
9791 if (ifn == IFN_MASK_LOAD)
9793 tree lhs = gimple_get_lhs (call);
9794 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9796 tree zero = build_zero_cst (TREE_TYPE (lhs));
9797 gimple *new_stmt = gimple_build_assign (lhs, zero);
9798 gsi_replace (&gsi, new_stmt, true);
9801 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9803 tree lhs = gimple_get_lhs (call);
9804 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9806 tree else_arg
9807 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9808 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9809 gsi_replace (&gsi, new_stmt, true);
9813 } /* BBs in loop */
9815 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9816 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9817 if (integer_onep (step_vector))
9818 niters_no_overflow = true;
9819 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9820 niters_vector_mult_vf, !niters_no_overflow);
9822 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9823 scale_profile_for_vect_loop (loop, assumed_vf);
9825 /* True if the final iteration might not handle a full vector's
9826 worth of scalar iterations. */
9827 bool final_iter_may_be_partial
9828 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9829 /* The minimum number of iterations performed by the epilogue. This
9830 is 1 when peeling for gaps because we always need a final scalar
9831 iteration. */
9832 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9833 /* +1 to convert latch counts to loop iteration counts,
9834 -min_epilogue_iters to remove iterations that cannot be performed
9835 by the vector code. */
9836 int bias_for_lowest = 1 - min_epilogue_iters;
9837 int bias_for_assumed = bias_for_lowest;
9838 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9839 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9841 /* When the amount of peeling is known at compile time, the first
9842 iteration will have exactly alignment_npeels active elements.
9843 In the worst case it will have at least one. */
9844 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9845 bias_for_lowest += lowest_vf - min_first_active;
9846 bias_for_assumed += assumed_vf - min_first_active;
9848 /* In these calculations the "- 1" converts loop iteration counts
9849 back to latch counts. */
9850 if (loop->any_upper_bound)
9852 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9853 loop->nb_iterations_upper_bound
9854 = (final_iter_may_be_partial
9855 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9856 lowest_vf) - 1
9857 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9858 lowest_vf) - 1);
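/* Illustrative sketch (assuming lowest_vf == 4, no peeling for gaps and
   no partial-vector peeling adjustment, so bias_for_lowest == 1, and a
   latch-count upper bound of 103, i.e. at most 104 iterations): the
   non-partial case gives floor ((103 + 1) / 4) - 1 = 25, i.e. at most
   26 vector iterations expressed again as a latch count.  */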
9859 if (main_vinfo)
9861 unsigned int bound;
9862 poly_uint64 main_iters
9863 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9864 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9865 main_iters
9866 = upper_bound (main_iters,
9867 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9868 if (can_div_away_from_zero_p (main_iters,
9869 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9870 &bound))
9871 loop->nb_iterations_upper_bound
9872 = wi::umin ((widest_int) (bound - 1),
9873 loop->nb_iterations_upper_bound);
9876 if (loop->any_likely_upper_bound)
9877 loop->nb_iterations_likely_upper_bound
9878 = (final_iter_may_be_partial
9879 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9880 + bias_for_lowest, lowest_vf) - 1
9881 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9882 + bias_for_lowest, lowest_vf) - 1);
9883 if (loop->any_estimate)
9884 loop->nb_iterations_estimate
9885 = (final_iter_may_be_partial
9886 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9887 assumed_vf) - 1
9888 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9889 assumed_vf) - 1);
9891 if (dump_enabled_p ())
9893 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9895 dump_printf_loc (MSG_NOTE, vect_location,
9896 "LOOP VECTORIZED\n");
9897 if (loop->inner)
9898 dump_printf_loc (MSG_NOTE, vect_location,
9899 "OUTER LOOP VECTORIZED\n");
9900 dump_printf (MSG_NOTE, "\n");
9902 else
9903 dump_printf_loc (MSG_NOTE, vect_location,
9904 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9905 GET_MODE_NAME (loop_vinfo->vector_mode));
9908 /* Loops vectorized with a variable factor won't benefit from
9909 unrolling/peeling. */
9910 if (!vf.is_constant ())
9912 loop->unroll = 1;
9913 if (dump_enabled_p ())
9914 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9915 " variable-length vectorization factor\n");
9917 /* Free SLP instances here because otherwise stmt reference counting
9918 won't work. */
9919 slp_instance instance;
9920 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9921 vect_free_slp_instance (instance);
9922 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9923 /* Clear up the safelen field since its value is invalid after vectorization,
9924 as the vectorized loop can have loop-carried dependencies. */
9925 loop->safelen = 0;
9927 if (epilogue)
9929 update_epilogue_loop_vinfo (epilogue, advance);
9931 epilogue->simduid = loop->simduid;
9932 epilogue->force_vectorize = loop->force_vectorize;
9933 epilogue->dont_vectorize = false;
9936 return epilogue;
9939 /* The code below is trying to perform a simple optimization - revert
9940 if-conversion for masked stores, i.e. if the mask of a store is zero
9941 do not perform it, and if possible also skip the stored-value producers.
9942 For example,
9943 for (i=0; i<n; i++)
9944 if (c[i])
9946 p1[i] += 1;
9947 p2[i] = p3[i] +2;
9949 this transformation will produce the following semi-hammock:
9951 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9953 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9954 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9955 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9956 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9957 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9958 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9962 void
9963 optimize_mask_stores (class loop *loop)
9965 basic_block *bbs = get_loop_body (loop);
9966 unsigned nbbs = loop->num_nodes;
9967 unsigned i;
9968 basic_block bb;
9969 class loop *bb_loop;
9970 gimple_stmt_iterator gsi;
9971 gimple *stmt;
9972 auto_vec<gimple *> worklist;
9973 auto_purge_vect_location sentinel;
9975 vect_location = find_loop_location (loop);
9976 /* Pick up all masked stores in loop if any. */
9977 for (i = 0; i < nbbs; i++)
9979 bb = bbs[i];
9980 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9981 gsi_next (&gsi))
9983 stmt = gsi_stmt (gsi);
9984 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9985 worklist.safe_push (stmt);
9989 free (bbs);
9990 if (worklist.is_empty ())
9991 return;
9993 /* Loop has masked stores. */
9994 while (!worklist.is_empty ())
9996 gimple *last, *last_store;
9997 edge e, efalse;
9998 tree mask;
9999 basic_block store_bb, join_bb;
10000 gimple_stmt_iterator gsi_to;
10001 tree vdef, new_vdef;
10002 gphi *phi;
10003 tree vectype;
10004 tree zero;
10006 last = worklist.pop ();
10007 mask = gimple_call_arg (last, 2);
10008 bb = gimple_bb (last);
10009 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
10010 the same loop as if_bb. It could be different from LOOP when a two-
10011 level loop nest is vectorized and the mask_store belongs to the inner
10012 one. */
10013 e = split_block (bb, last);
10014 bb_loop = bb->loop_father;
10015 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10016 join_bb = e->dest;
10017 store_bb = create_empty_bb (bb);
10018 add_bb_to_loop (store_bb, bb_loop);
10019 e->flags = EDGE_TRUE_VALUE;
10020 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10021 /* Put STORE_BB into the likely part. */
10022 efalse->probability = profile_probability::unlikely ();
10023 store_bb->count = efalse->count ();
10024 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10025 if (dom_info_available_p (CDI_DOMINATORS))
10026 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10027 if (dump_enabled_p ())
10028 dump_printf_loc (MSG_NOTE, vect_location,
10029 "Create new block %d to sink mask stores.",
10030 store_bb->index);
10031 /* Create vector comparison with boolean result. */
10032 vectype = TREE_TYPE (mask);
10033 zero = build_zero_cst (vectype);
10034 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10035 gsi = gsi_last_bb (bb);
10036 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10037 /* Create a new PHI node for the vdef of the last masked store:
10038 .MEM_2 = VDEF <.MEM_1>
10039 will be converted to
10040 .MEM_3 = VDEF <.MEM_1>
10041 and a new PHI node will be created in the join bb:
10042 .MEM_2 = PHI <.MEM_1, .MEM_3>
10043 */
10044 vdef = gimple_vdef (last);
10045 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10046 gimple_set_vdef (last, new_vdef);
10047 phi = create_phi_node (vdef, join_bb);
10048 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10050 /* Put all masked stores with the same mask to STORE_BB if possible. */
10051 while (true)
10052 {
10053 gimple_stmt_iterator gsi_from;
10054 gimple *stmt1 = NULL;
10056 /* Move masked store to STORE_BB. */
10057 last_store = last;
10058 gsi = gsi_for_stmt (last);
10059 gsi_from = gsi;
10060 /* Shift GSI to the previous stmt for further traversal. */
10061 gsi_prev (&gsi);
10062 gsi_to = gsi_start_bb (store_bb);
10063 gsi_move_before (&gsi_from, &gsi_to);
10064 /* Set GSI_TO to the start of the now non-empty block. */
10065 gsi_to = gsi_start_bb (store_bb);
10066 if (dump_enabled_p ())
10067 dump_printf_loc (MSG_NOTE, vect_location,
10068 "Move stmt to created bb\n%G", last);
10069 /* Move all stored value producers if possible. */
10070 while (!gsi_end_p (gsi))
10071 {
10072 tree lhs;
10073 imm_use_iterator imm_iter;
10074 use_operand_p use_p;
10075 bool res;
10077 /* Skip debug statements. */
10078 if (is_gimple_debug (gsi_stmt (gsi)))
10079 {
10080 gsi_prev (&gsi);
10081 continue;
10082 }
10083 stmt1 = gsi_stmt (gsi);
10084 /* Do not consider statements writing to memory or having
10085 a volatile operand. */
10086 if (gimple_vdef (stmt1)
10087 || gimple_has_volatile_ops (stmt1))
10088 break;
10089 gsi_from = gsi;
10090 gsi_prev (&gsi);
10091 lhs = gimple_get_lhs (stmt1);
10092 if (!lhs)
10093 break;
10095 /* LHS of vectorized stmt must be SSA_NAME. */
10096 if (TREE_CODE (lhs) != SSA_NAME)
10097 break;
10099 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10100 {
10101 /* Remove dead scalar statement. */
10102 if (has_zero_uses (lhs))
10103 {
10104 gsi_remove (&gsi_from, true);
10105 continue;
10106 }
10107 break;
10108 }
10109 /* Check that LHS does not have uses outside of STORE_BB. */
10110 res = true;
10111 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10112 {
10113 gimple *use_stmt;
10114 use_stmt = USE_STMT (use_p);
10115 if (is_gimple_debug (use_stmt))
10116 continue;
10117 if (gimple_bb (use_stmt) != store_bb)
10118 {
10119 res = false;
10120 break;
10121 }
10122 }
10123 if (!res)
10124 break;
10126 if (gimple_vuse (stmt1)
10127 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10128 break;
10130 /* Can move STMT1 to STORE_BB. */
10131 if (dump_enabled_p ())
10132 dump_printf_loc (MSG_NOTE, vect_location,
10133 "Move stmt to created bb\n%G", stmt1);
10134 gsi_move_before (&gsi_from, &gsi_to);
10135 /* Shift GSI_TO for further insertion. */
10136 gsi_prev (&gsi_to);
10137 }
10138 /* Put other masked stores with the same mask to STORE_BB. */
10139 if (worklist.is_empty ()
10140 || gimple_call_arg (worklist.last (), 2) != mask
10141 || worklist.last () != stmt1)
10142 break;
10143 last = worklist.pop ();
10144 }
10145 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10146 }
10147 }
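/* Editorial note: the following is an illustrative sketch, not part of the
   pass. It shows, in plain scalar C, the runtime effect of the
   transformation performed above: the masked stores and the statements that
   produce their stored values are guarded by an "is the whole mask false?"
   test and skipped when no lane is active. The names masked_store_example,
   mask_all_zero_p and VLEN are hypothetical and exist only for this example;
   the block is guarded out so it does not become part of the compiled file. */
#if 0
#include <stdbool.h>
#include <stddef.h>

#define VLEN 8	/* Hypothetical vector length.  */

/* Return true iff every lane of MASK is false.  */
static bool
mask_all_zero_p (const bool mask[VLEN])
{
  for (size_t i = 0; i < VLEN; i++)
    if (mask[i])
      return false;
  return true;
}

/* One "vector iteration" of the example loop from the comment above:
   only perform the per-lane work when at least one lane is active.  */
static void
masked_store_example (int p1[VLEN], int p2[VLEN], const int p3[VLEN],
		      const bool mask[VLEN])
{
  if (!mask_all_zero_p (mask))
    for (size_t i = 0; i < VLEN; i++)
      if (mask[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] + 2;
	}
}
#endif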
10149 /* Decide whether it is possible to use a zero-based induction variable
10150 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10151 the value that the induction variable must be able to hold in order
10152 to ensure that the rgroups eventually have no active vector elements.
10153 Return -1 otherwise. */
10155 widest_int
10156 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10157 {
10158 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10159 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10160 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10162 /* Calculate the value that the induction variable must be able
10163 to hit in order to ensure that we end the loop with an all-false mask.
10164 This involves adding the maximum number of inactive trailing scalar
10165 iterations. */
10166 widest_int iv_limit = -1;
10167 if (max_loop_iterations (loop, &iv_limit))
10168 {
10169 if (niters_skip)
10170 {
10171 /* Add the maximum number of skipped iterations to the
10172 maximum iteration count. */
10173 if (TREE_CODE (niters_skip) == INTEGER_CST)
10174 iv_limit += wi::to_widest (niters_skip);
10175 else
10176 iv_limit += max_vf - 1;
10177 }
10178 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10179 /* Make a conservatively-correct assumption. */
10180 iv_limit += max_vf - 1;
10182 /* IV_LIMIT is the maximum number of latch iterations, which is also
10183 the maximum in-range IV value. Round this value down to the previous
10184 vector alignment boundary and then add an extra full iteration. */
10185 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10186 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10187 }
10188 return iv_limit;
10189 }
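/* Editorial note: an illustrative sketch, not part of the pass, of the
   rounding step above on plain integers. The real code uses widest_int and
   poly_uint64; the helper name and parameters below are hypothetical. For a
   constant power-of-two VF, known_alignment (vf) and max_vf both equal VF,
   so e.g. with a maximum latch-iteration count of 1003 and VF == 8 we get
   (1003 & -8) + 8 == 1000 + 8 == 1008: the limit is rounded down to the
   previous multiple of the vector length and one full vector iteration is
   added.  */
#if 0
static unsigned long long
iv_limit_example (unsigned long long max_latch_iters, unsigned vf)
{
  /* VF is assumed to be a power of two, so -VF is the round-down mask.  */
  unsigned long long limit = max_latch_iters & -(unsigned long long) vf;
  return limit + vf;
}
#endif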
10191 /* For the given rgroup_controls RGC, check whether an induction variable
10192 would ever hit a value that produces a set of all-false masks or zero
10193 lengths before wrapping around. Return true if it's possible to wrap
10194 around before hitting the desired value, otherwise return false. */
10196 bool
10197 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10198 {
10199 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10201 if (iv_limit == -1)
10202 return true;
10204 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10205 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10206 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10208 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10209 return true;
10211 return false;
10212 }
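/* Editorial note: an illustrative sketch, not part of the pass, of the
   precision check above on plain integers. The helper name and parameters
   are hypothetical; the real code uses wi::min_precision on widest_int
   values. The IV counts scalar items, so it must be able to represent
   IV_LIMIT * NITEMS; if that value needs more bits than the compare type
   provides, the IV might wrap before the all-false / zero-length value is
   reached.  */
#if 0
#include <stdbool.h>

static bool
iv_might_wrap_example (unsigned long long iv_limit, unsigned nitems,
		       unsigned compare_precision)
{
  /* Assumed not to overflow here; the real code uses arbitrary precision.  */
  unsigned long long max_value = iv_limit * nitems;

  /* Number of bits needed to represent MAX_VALUE as an unsigned value.  */
  unsigned bits_needed = 0;
  while (max_value)
    {
      bits_needed++;
      max_value >>= 1;
    }
  return bits_needed > compare_precision;
}
#endif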