gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 158                                                bool *, bool *);
 159
 160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 161    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 162    may already be set for general statements (not just data refs).  */
 163
 164 static opt_result
 165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 166                               bool vectype_maybe_set_p,
 167                               poly_uint64 *vf)
 168 {
 169   gimple *stmt = stmt_info->stmt;
 170
 171   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 172        && !STMT_VINFO_LIVE_P (stmt_info))
 173       || gimple_clobber_p (stmt))
 174     {
 175       if (dump_enabled_p ())
 176         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 177       return opt_result::success ();
 178     }
 179
 180   tree stmt_vectype, nunits_vectype;
 181   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 182                                                    &stmt_vectype,
 183                                                    &nunits_vectype);
 184   if (!res)
 185     return res;
 186
 187   if (stmt_vectype)
 188     {
 189       if (STMT_VINFO_VECTYPE (stmt_info))
 190         /* The only case when a vectype had been already set is for stmts
 191            that contain a data ref, or for "pattern-stmts" (stmts generated
 192            by the vectorizer to represent/replace a certain idiom).  */
 193         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 194                      || vectype_maybe_set_p)
 195                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 196       else
 197         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 198     }
 199
 200   if (nunits_vectype)
 201     vect_update_max_nunits (vf, nunits_vectype);
 202
 203   return opt_result::success ();
 204 }
 205
 206 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 207    types of STMT_INFO and all attached pattern statements and update
 208    the vectorization factor VF accordingly.  Return true on success
 209    or false if something prevented vectorization.  */
 210
 211 static opt_result
 212 vect_determine_vf_for_stmt (vec_info *vinfo,
 213                             stmt_vec_info stmt_info, poly_uint64 *vf)
 214 {
 215   if (dump_enabled_p ())
 216     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 217                      stmt_info->stmt);
 218   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 219   if (!res)
 220     return res;
 221
 222   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 223       && STMT_VINFO_RELATED_STMT (stmt_info))
 224     {
 225       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 226       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 227
 228       /* If a pattern statement has def stmts, analyze them too.  */
 229       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 230            !gsi_end_p (si); gsi_next (&si))
 231         {
 232           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 233           if (dump_enabled_p ())
 234             dump_printf_loc (MSG_NOTE, vect_location,
 235                              "==> examining pattern def stmt: %G",
 236                              def_stmt_info->stmt);
 237           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 238           if (!res)
 239             return res;
 240         }
 241
 242       if (dump_enabled_p ())
 243         dump_printf_loc (MSG_NOTE, vect_location,
 244                          "==> examining pattern statement: %G",
 245                          stmt_info->stmt);
 246       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 247       if (!res)
 248         return res;
 249     }
 250
 251   return opt_result::success ();
 252 }
 253
 254 /* Function vect_determine_vectorization_factor
 255
 256    Determine the vectorization factor (VF).  VF is the number of data elements
 257    that are operated upon in parallel in a single iteration of the vectorized
 258    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 259    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 260    elements can fit in a single vector register.
 261
 262    We currently support vectorization of loops in which all types operated upon
 263    are of the same size.  Therefore this function currently sets VF according to
 264    the size of the types operated upon, and fails if there are multiple sizes
 265    in the loop.
 266
 267    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 268    original loop:
 269         for (i=0; i<N; i++){
 270           a[i] = b[i] + c[i];
 271         }
 272
 273    vectorized loop:
 274         for (i=0; i<N; i+=VF){
 275           a[i:VF] = b[i:VF] + c[i:VF];
 276         }
 277 */
 278
 279 static opt_result
 280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 281 {
 282   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 283   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 284   unsigned nbbs = loop->num_nodes;
 285   poly_uint64 vectorization_factor = 1;
 286   tree scalar_type = NULL_TREE;
 287   gphi *phi;
 288   tree vectype;
 289   stmt_vec_info stmt_info;
 290   unsigned i;
 291
 292   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 293
 294   for (i = 0; i < nbbs; i++)
 295     {
 296       basic_block bb = bbs[i];
 297
 298       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 299            gsi_next (&si))
 300         {
 301           phi = si.phi ();
 302           stmt_info = loop_vinfo->lookup_stmt (phi);
 303           if (dump_enabled_p ())
 304             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 305                              phi);
 306
 307           gcc_assert (stmt_info);
 308
 309           if (STMT_VINFO_RELEVANT_P (stmt_info)
 310               || STMT_VINFO_LIVE_P (stmt_info))
 311             {
 312               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 313               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 314
 315               if (dump_enabled_p ())
 316                 dump_printf_loc (MSG_NOTE, vect_location,
 317                                  "get vectype for scalar type:  %T\n",
 318                                  scalar_type);
 319
 320               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 321               if (!vectype)
 322                 return opt_result::failure_at (phi,
 323                                                "not vectorized: unsupported "
 324                                                "data-type %T\n",
 325                                                scalar_type);
 326               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 327
 328               if (dump_enabled_p ())
 329                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 330                                  vectype);
 331
 332               if (dump_enabled_p ())
 333                 {
 334                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 335                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 336                   dump_printf (MSG_NOTE, "\n");
 337                 }
 338
 339               vect_update_max_nunits (&vectorization_factor, vectype);
 340             }
 341         }
 342
 343       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 344            gsi_next (&si))
 345         {
 346           if (is_gimple_debug (gsi_stmt (si)))
 347             continue;
 348           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 349           opt_result res
 350             = vect_determine_vf_for_stmt (loop_vinfo,
 351                                           stmt_info, &vectorization_factor);
 352           if (!res)
 353             return res;
 354         }
 355     }
 356
 357   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 358   if (dump_enabled_p ())
 359     {
 360       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 361       dump_dec (MSG_NOTE, vectorization_factor);
 362       dump_printf (MSG_NOTE, "\n");
 363     }
 364
 365   if (known_le (vectorization_factor, 1U))
 366     return opt_result::failure_at (vect_location,
 367                                    "not vectorized: unsupported data-type\n");
 368   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 369   return opt_result::success ();
 370 }
 371
 372
 373 /* Function vect_is_simple_iv_evolution.
 374
 375    FORNOW: A simple evolution of an induction variables in the loop is
 376    considered a polynomial evolution.  */
 377
 378 static bool
 379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 380                              tree * step)
 381 {
 382   tree init_expr;
 383   tree step_expr;
 384   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 385   basic_block bb;
 386
 387   /* When there is no evolution in this loop, the evolution function
 388      is not "simple".  */
 389   if (evolution_part == NULL_TREE)
 390     return false;
 391
 392   /* When the evolution is a polynomial of degree >= 2
 393      the evolution function is not "simple".  */
 394   if (tree_is_chrec (evolution_part))
 395     return false;
 396
 397   step_expr = evolution_part;
 398   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 399
 400   if (dump_enabled_p ())
 401     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 402                      step_expr, init_expr);
 403
 404   *init = init_expr;
 405   *step = step_expr;
 406
 407   if (TREE_CODE (step_expr) != INTEGER_CST
 408       && (TREE_CODE (step_expr) != SSA_NAME
 409           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 410               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 411           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 412               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 413                   || !flag_associative_math)))
 414       && (TREE_CODE (step_expr) != REAL_CST
 415           || !flag_associative_math))
 416     {
 417       if (dump_enabled_p ())
 418         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 419                          "step unknown.\n");
 420       return false;
 421     }
 422
 423   return true;
 424 }
 425
 426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 427    what we are assuming is a double reduction.  For example, given
 428    a structure like this:
 429
 430       outer1:
 431         x_1 = PHI <x_4(outer2), ...>;
 432         ...
 433
 434       inner:
 435         x_2 = PHI <x_1(outer1), ...>;
 436         ...
 437         x_3 = ...;
 438         ...
 439
 440       outer2:
 441         x_4 = PHI <x_3(inner)>;
 442         ...
 443
 444    outer loop analysis would treat x_1 as a double reduction phi and
 445    this function would then return true for x_2.  */
 446
 447 static bool
 448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 449 {
 450   use_operand_p use_p;
 451   ssa_op_iter op_iter;
 452   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 453     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 454       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 455         return true;
 456   return false;
 457 }
 458
 459 /* Function vect_analyze_scalar_cycles_1.
 460
 461    Examine the cross iteration def-use cycles of scalar variables
 462    in LOOP.  LOOP_VINFO represents the loop that is now being
 463    considered for vectorization (can be LOOP, or an outer-loop
 464    enclosing LOOP).  */
 465
 466 static void
 467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 468 {
 469   basic_block bb = loop->header;
 470   tree init, step;
 471   auto_vec<stmt_vec_info, 64> worklist;
 472   gphi_iterator gsi;
 473   bool double_reduc, reduc_chain;
 474
 475   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 476
 477   /* First - identify all inductions.  Reduction detection assumes that all the
 478      inductions have been identified, therefore, this order must not be
 479      changed.  */
 480   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 481     {
 482       gphi *phi = gsi.phi ();
 483       tree access_fn = NULL;
 484       tree def = PHI_RESULT (phi);
 485       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 486
 487       if (dump_enabled_p ())
 488         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 489
 490       /* Skip virtual phi's.  The data dependences that are associated with
 491          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 492       if (virtual_operand_p (def))
 493         continue;
 494
 495       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 496
 497       /* Analyze the evolution function.  */
 498       access_fn = analyze_scalar_evolution (loop, def);
 499       if (access_fn)
 500         {
 501           STRIP_NOPS (access_fn);
 502           if (dump_enabled_p ())
 503             dump_printf_loc (MSG_NOTE, vect_location,
 504                              "Access function of PHI: %T\n", access_fn);
 505           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 506             = initial_condition_in_loop_num (access_fn, loop->num);
 507           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 508             = evolution_part_in_loop_num (access_fn, loop->num);
 509         }
 510
 511       if (!access_fn
 512           || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 513           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 514           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 515               && TREE_CODE (step) != INTEGER_CST))
 516         {
 517           worklist.safe_push (stmt_vinfo);
 518           continue;
 519         }
 520
 521       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 522                   != NULL_TREE);
 523       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 524
 525       if (dump_enabled_p ())
 526         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 527       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 528     }
 529
 530
 531   /* Second - identify all reductions and nested cycles.  */
 532   while (worklist.length () > 0)
 533     {
 534       stmt_vec_info stmt_vinfo = worklist.pop ();
 535       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 536       tree def = PHI_RESULT (phi);
 537
 538       if (dump_enabled_p ())
 539         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 540
 541       gcc_assert (!virtual_operand_p (def)
 542                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 543
 544       stmt_vec_info reduc_stmt_info
 545         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 546                                     &reduc_chain);
 547       if (reduc_stmt_info)
 548         {
 549           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 550           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 551           if (double_reduc)
 552             {
 553               if (dump_enabled_p ())
 554                 dump_printf_loc (MSG_NOTE, vect_location,
 555                                  "Detected double reduction.\n");
 556
 557               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 558               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 559             }
 560           else
 561             {
 562               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 563                 {
 564                   if (dump_enabled_p ())
 565                     dump_printf_loc (MSG_NOTE, vect_location,
 566                                      "Detected vectorizable nested cycle.\n");
 567
 568                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 569                 }
 570               else
 571                 {
 572                   if (dump_enabled_p ())
 573                     dump_printf_loc (MSG_NOTE, vect_location,
 574                                      "Detected reduction.\n");
 575
 576                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 577                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 578                   /* Store the reduction cycles for possible vectorization in
 579                      loop-aware SLP if it was not detected as reduction
 580                      chain.  */
 581                   if (! reduc_chain)
 582                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 583                       (reduc_stmt_info);
 584                 }
 585             }
 586         }
 587       else
 588         if (dump_enabled_p ())
 589           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 590                            "Unknown def-use cycle pattern.\n");
 591     }
 592 }
 593
 594
 595 /* Function vect_analyze_scalar_cycles.
 596
 597    Examine the cross iteration def-use cycles of scalar variables, by
 598    analyzing the loop-header PHIs of scalar variables.  Classify each
 599    cycle as one of the following: invariant, induction, reduction, unknown.
 600    We do that for the loop represented by LOOP_VINFO, and also to its
 601    inner-loop, if exists.
 602    Examples for scalar cycles:
 603
 604    Example1: reduction:
 605
 606               loop1:
 607               for (i=0; i<N; i++)
 608                  sum += a[i];
 609
 610    Example2: induction:
 611
 612               loop2:
 613               for (i=0; i<N; i++)
 614                  a[i] = i;  */
 615
 616 static void
 617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 618 {
 619   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 620
 621   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 622
 623   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 624      Reductions in such inner-loop therefore have different properties than
 625      the reductions in the nest that gets vectorized:
 626      1. When vectorized, they are executed in the same order as in the original
 627         scalar loop, so we can't change the order of computation when
 628         vectorizing them.
 629      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 630         current checks are too strict.  */
 631
 632   if (loop->inner)
 633     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 634 }
 635
 636 /* Transfer group and reduction information from STMT_INFO to its
 637    pattern stmt.  */
 638
 639 static void
 640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 641 {
 642   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 643   stmt_vec_info stmtp;
 644   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 645               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 646   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 647   do
 648     {
 649       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 650       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 651                            == STMT_VINFO_DEF_TYPE (stmt_info));
 652       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 653       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 654       if (stmt_info)
 655         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 656           = STMT_VINFO_RELATED_STMT (stmt_info);
 657     }
 658   while (stmt_info);
 659 }
 660
 661 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 662
 663 static void
 664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 665 {
 666   stmt_vec_info first;
 667   unsigned i;
 668
 669   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 670     {
 671       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 672       while (next)
 673         {
 674           if ((STMT_VINFO_IN_PATTERN_P (next)
 675                != STMT_VINFO_IN_PATTERN_P (first))
 676               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 677             break;
 678           next = REDUC_GROUP_NEXT_ELEMENT (next);
 679         }
 680       /* If all reduction chain members are well-formed patterns adjust
 681          the group to group the pattern stmts instead.  */
 682       if (! next
 683           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 684         {
 685           if (STMT_VINFO_IN_PATTERN_P (first))
 686             {
 687               vect_fixup_reduc_chain (first);
 688               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 689                 = STMT_VINFO_RELATED_STMT (first);
 690             }
 691         }
 692       /* If not all stmt in the chain are patterns or if we failed
 693          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 694          it as regular reduction instead.  */
 695       else
 696         {
 697           stmt_vec_info vinfo = first;
 698           stmt_vec_info last = NULL;
 699           while (vinfo)
 700             {
 701               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 702               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 703               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 704               last = vinfo;
 705               vinfo = next;
 706             }
 707           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 708             = vect_internal_def;
 709           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 710           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 711           --i;
 712         }
 713     }
 714 }
 715
 716 /* Function vect_get_loop_niters.
 717
 718    Determine how many iterations the loop is executed and place it
 719    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 720    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 721    niter information holds in ASSUMPTIONS.
 722
 723    Return the loop exit condition.  */
 724
 725
 726 static gcond *
 727 vect_get_loop_niters (class loop *loop, tree *assumptions,
 728                       tree *number_of_iterations, tree *number_of_iterationsm1)
 729 {
 730   edge exit = single_exit (loop);
 731   class tree_niter_desc niter_desc;
 732   tree niter_assumptions, niter, may_be_zero;
 733   gcond *cond = get_loop_exit_condition (loop);
 734
 735   *assumptions = boolean_true_node;
 736   *number_of_iterationsm1 = chrec_dont_know;
 737   *number_of_iterations = chrec_dont_know;
 738   DUMP_VECT_SCOPE ("get_loop_niters");
 739
 740   if (!exit)
 741     return cond;
 742
 743   may_be_zero = NULL_TREE;
 744   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 745       || chrec_contains_undetermined (niter_desc.niter))
 746     return cond;
 747
 748   niter_assumptions = niter_desc.assumptions;
 749   may_be_zero = niter_desc.may_be_zero;
 750   niter = niter_desc.niter;
 751
 752   if (may_be_zero && integer_zerop (may_be_zero))
 753     may_be_zero = NULL_TREE;
 754
 755   if (may_be_zero)
 756     {
 757       if (COMPARISON_CLASS_P (may_be_zero))
 758         {
 759           /* Try to combine may_be_zero with assumptions, this can simplify
 760              computation of niter expression.  */
 761           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 762             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 763                                              niter_assumptions,
 764                                              fold_build1 (TRUTH_NOT_EXPR,
 765                                                           boolean_type_node,
 766                                                           may_be_zero));
 767           else
 768             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 769                                  build_int_cst (TREE_TYPE (niter), 0),
 770                                  rewrite_to_non_trapping_overflow (niter));
 771
 772           may_be_zero = NULL_TREE;
 773         }
 774       else if (integer_nonzerop (may_be_zero))
 775         {
 776           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 777           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 778           return cond;
 779         }
 780       else
 781         return cond;
 782     }
 783
 784   *assumptions = niter_assumptions;
 785   *number_of_iterationsm1 = niter;
 786
 787   /* We want the number of loop header executions which is the number
 788      of latch executions plus one.
 789      ???  For UINT_MAX latch executions this number overflows to zero
 790      for loops like do { n++; } while (n != 0);  */
 791   if (niter && !chrec_contains_undetermined (niter))
 792     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 793                           build_int_cst (TREE_TYPE (niter), 1));
 794   *number_of_iterations = niter;
 795
 796   return cond;
 797 }
 798
 799 /* Function bb_in_loop_p
 800
 801    Used as predicate for dfs order traversal of the loop bbs.  */
 802
 803 static bool
 804 bb_in_loop_p (const_basic_block bb, const void *data)
 805 {
 806   const class loop *const loop = (const class loop *)data;
 807   if (flow_bb_inside_loop_p (loop, bb))
 808     return true;
 809   return false;
 810 }
 811
 812
 813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 814    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 815
 816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 817   : vec_info (vec_info::loop, shared),
 818     loop (loop_in),
 819     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 820     num_itersm1 (NULL_TREE),
 821     num_iters (NULL_TREE),
 822     num_iters_unchanged (NULL_TREE),
 823     num_iters_assumptions (NULL_TREE),
 824     th (0),
 825     versioning_threshold (0),
 826     vectorization_factor (0),
 827     main_loop_edge (nullptr),
 828     skip_main_loop_edge (nullptr),
 829     skip_this_loop_edge (nullptr),
 830     reusable_accumulators (),
 831     max_vectorization_factor (0),
 832     mask_skip_niters (NULL_TREE),
 833     rgroup_compare_type (NULL_TREE),
 834     simd_if_cond (NULL_TREE),
 835     unaligned_dr (NULL),
 836     peeling_for_alignment (0),
 837     ptr_mask (0),
 838     ivexpr_map (NULL),
 839     scan_map (NULL),
 840     slp_unrolling_factor (1),
 841     single_scalar_iteration_cost (0),
 842     vec_outside_cost (0),
 843     vec_inside_cost (0),
 844     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 845     vectorizable (false),
 846     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 847     using_partial_vectors_p (false),
 848     epil_using_partial_vectors_p (false),
 849     peeling_for_gaps (false),
 850     peeling_for_niter (false),
 851     no_data_dependencies (false),
 852     has_mask_store (false),
 853     scalar_loop_scaling (profile_probability::uninitialized ()),
 854     scalar_loop (NULL),
 855     orig_loop_info (NULL)
 856 {
 857   /* CHECKME: We want to visit all BBs before their successors (except for
 858      latch blocks, for which this assertion wouldn't hold).  In the simple
 859      case of the loop forms we allow, a dfs order of the BBs would the same
 860      as reversed postorder traversal, so we are safe.  */
 861
 862   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 863                                           bbs, loop->num_nodes, loop);
 864   gcc_assert (nbbs == loop->num_nodes);
 865
 866   for (unsigned int i = 0; i < nbbs; i++)
 867     {
 868       basic_block bb = bbs[i];
 869       gimple_stmt_iterator si;
 870
 871       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 872         {
 873           gimple *phi = gsi_stmt (si);
 874           gimple_set_uid (phi, 0);
 875           add_stmt (phi);
 876         }
 877
 878       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 879         {
 880           gimple *stmt = gsi_stmt (si);
 881           gimple_set_uid (stmt, 0);
 882           if (is_gimple_debug (stmt))
 883             continue;
 884           add_stmt (stmt);
 885           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 886              third argument is the #pragma omp simd if (x) condition, when 0,
 887              loop shouldn't be vectorized, when non-zero constant, it should
 888              be vectorized normally, otherwise versioned with vectorized loop
 889              done if the condition is non-zero at runtime.  */
 890           if (loop_in->simduid
 891               && is_gimple_call (stmt)
 892               && gimple_call_internal_p (stmt)
 893               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 894               && gimple_call_num_args (stmt) >= 3
 895               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 896               && (loop_in->simduid
 897                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 898             {
 899               tree arg = gimple_call_arg (stmt, 2);
 900               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 901                 simd_if_cond = arg;
 902               else
 903                 gcc_assert (integer_nonzerop (arg));
 904             }
 905         }
 906     }
 907
 908   epilogue_vinfos.create (6);
 909 }
 910
 911 /* Free all levels of rgroup CONTROLS.  */
 912
 913 void
 914 release_vec_loop_controls (vec<rgroup_controls> *controls)
 915 {
 916   rgroup_controls *rgc;
 917   unsigned int i;
 918   FOR_EACH_VEC_ELT (*controls, i, rgc)
 919     rgc->controls.release ();
 920   controls->release ();
 921 }
 922
 923 /* Free all memory used by the _loop_vec_info, as well as all the
 924    stmt_vec_info structs of all the stmts in the loop.  */
 925
 926 _loop_vec_info::~_loop_vec_info ()
 927 {
 928   free (bbs);
 929
 930   release_vec_loop_controls (&masks);
 931   release_vec_loop_controls (&lens);
 932   delete ivexpr_map;
 933   delete scan_map;
 934   epilogue_vinfos.release ();
 935
 936   /* When we release an epiloge vinfo that we do not intend to use
 937      avoid clearing AUX of the main loop which should continue to
 938      point to the main loop vinfo since otherwise we'll leak that.  */
 939   if (loop->aux == this)
 940     loop->aux = NULL;
 941 }
 942
 943 /* Return an invariant or register for EXPR and emit necessary
 944    computations in the LOOP_VINFO loop preheader.  */
 945
 946 tree
 947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 948 {
 949   if (is_gimple_reg (expr)
 950       || is_gimple_min_invariant (expr))
 951     return expr;
 952
 953   if (! loop_vinfo->ivexpr_map)
 954     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 955   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 956   if (! cached)
 957     {
 958       gimple_seq stmts = NULL;
 959       cached = force_gimple_operand (unshare_expr (expr),
 960                                      &stmts, true, NULL_TREE);
 961       if (stmts)
 962         {
 963           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 964           gsi_insert_seq_on_edge_immediate (e, stmts);
 965         }
 966     }
 967   return cached;
 968 }
 969
 970 /* Return true if we can use CMP_TYPE as the comparison type to produce
 971    all masks required to mask LOOP_VINFO.  */
 972
 973 static bool
 974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 975 {
 976   rgroup_controls *rgm;
 977   unsigned int i;
 978   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 979     if (rgm->type != NULL_TREE
 980         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 981                                             cmp_type, rgm->type,
 982                                             OPTIMIZE_FOR_SPEED))
 983       return false;
 984   return true;
 985 }
 986
 987 /* Calculate the maximum number of scalars per iteration for every
 988    rgroup in LOOP_VINFO.  */
 989
 990 static unsigned int
 991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 992 {
 993   unsigned int res = 1;
 994   unsigned int i;
 995   rgroup_controls *rgm;
 996   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 997     res = MAX (res, rgm->max_nscalars_per_iter);
 998   return res;
 999 }
1000
1001 /* Calculate the minimum precision necessary to represent:
1002
1003       MAX_NITERS * FACTOR
1004
1005    as an unsigned integer, where MAX_NITERS is the maximum number of
1006    loop header iterations for the original scalar form of LOOP_VINFO.  */
1007
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1010 {
1011   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1012
1013   /* Get the maximum number of iterations that is representable
1014      in the counter type.  */
1015   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1017
1018   /* Get a more refined estimate for the number of iterations.  */
1019   widest_int max_back_edges;
1020   if (max_loop_iterations (loop, &max_back_edges))
1021     max_ni = wi::smin (max_ni, max_back_edges + 1);
1022
1023   /* Work out how many bits we need to represent the limit.  */
1024   return wi::min_precision (max_ni * factor, UNSIGNED);
1025 }
1026
1027 /* True if the loop needs peeling or partial vectors when vectorized.  */
1028
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1031 {
1032   unsigned HOST_WIDE_INT const_vf;
1033   HOST_WIDE_INT max_niter
1034     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1035
1036   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039                                           (loop_vinfo));
1040
1041   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1043     {
1044       /* Work out the (constant) number of iterations that need to be
1045          peeled for reasons other than niters.  */
1046       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048         peel_niter += 1;
1049       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051         return true;
1052     }
1053   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054       /* ??? When peeling for gaps but not alignment, we could
1055          try to check whether the (variable) niters is known to be
1056          VF * N + 1.  That's something of a niche case though.  */
1057       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060            < (unsigned) exact_log2 (const_vf))
1061           /* In case of versioning, check if the maximum number of
1062              iterations is greater than th.  If they are identical,
1063              the epilogue is unnecessary.  */
1064           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065               || ((unsigned HOST_WIDE_INT) max_niter
1066                   > (th / const_vf) * const_vf))))
1067     return true;
1068
1069   return false;
1070 }
1071
1072 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1073    whether we can actually generate the masks required.  Return true if so,
1074    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1075
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1078 {
1079   unsigned int min_ni_width;
1080   unsigned int max_nscalars_per_iter
1081     = vect_get_max_nscalars_per_iter (loop_vinfo);
1082
1083   /* Use a normal loop if there are no statements that need masking.
1084      This only happens in rare degenerate cases: it means that the loop
1085      has no loads, no stores, and no live-out values.  */
1086   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087     return false;
1088
1089   /* Work out how many bits we need to represent the limit.  */
1090   min_ni_width
1091     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1092
1093   /* Find a scalar mode for which WHILE_ULT is supported.  */
1094   opt_scalar_int_mode cmp_mode_iter;
1095   tree cmp_type = NULL_TREE;
1096   tree iv_type = NULL_TREE;
1097   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098   unsigned int iv_precision = UINT_MAX;
1099
1100   if (iv_limit != -1)
1101     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102                                       UNSIGNED);
1103
1104   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1105     {
1106       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107       if (cmp_bits >= min_ni_width
1108           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1109         {
1110           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111           if (this_type
1112               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1113             {
1114               /* Although we could stop as soon as we find a valid mode,
1115                  there are at least two reasons why that's not always the
1116                  best choice:
1117
1118                  - An IV that's Pmode or wider is more likely to be reusable
1119                    in address calculations than an IV that's narrower than
1120                    Pmode.
1121
1122                  - Doing the comparison in IV_PRECISION or wider allows
1123                    a natural 0-based IV, whereas using a narrower comparison
1124                    type requires mitigations against wrap-around.
1125
1126                  Conversely, if the IV limit is variable, doing the comparison
1127                  in a wider type than the original type can introduce
1128                  unnecessary extensions, so picking the widest valid mode
1129                  is not always a good choice either.
1130
1131                  Here we prefer the first IV type that's Pmode or wider,
1132                  and the first comparison type that's IV_PRECISION or wider.
1133                  (The comparison type must be no wider than the IV type,
1134                  to avoid extensions in the vector loop.)
1135
1136                  ??? We might want to try continuing beyond Pmode for ILP32
1137                  targets if CMP_BITS < IV_PRECISION.  */
1138               iv_type = this_type;
1139               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140                 cmp_type = this_type;
1141               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142                 break;
1143             }
1144         }
1145     }
1146
1147   if (!cmp_type)
1148     return false;
1149
1150   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152   return true;
1153 }
1154
1155 /* Check whether we can use vector access with length based on precison
1156    comparison.  So far, to keep it simple, we only allow the case that the
1157    precision of the target supported length is larger than the precision
1158    required by loop niters.  */
1159
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1162 {
1163   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164     return false;
1165
1166   unsigned int max_nitems_per_iter = 1;
1167   unsigned int i;
1168   rgroup_controls *rgl;
1169   /* Find the maximum number of items per iteration for every rgroup.  */
1170   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1171     {
1172       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1174     }
1175
1176   /* Work out how many bits we need to represent the length limit.  */
1177   unsigned int min_ni_prec
1178     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1179
1180   /* Now use the maximum of below precisions for one suitable IV type:
1181      - the IV's natural precision
1182      - the precision needed to hold: the maximum number of scalar
1183        iterations multiplied by the scale factor (min_ni_prec above)
1184      - the Pmode precision
1185
1186      If min_ni_prec is less than the precision of the current niters,
1187      we perfer to still use the niters type.  Prefer to use Pmode and
1188      wider IV to avoid narrow conversions.  */
1189
1190   unsigned int ni_prec
1191     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192   min_ni_prec = MAX (min_ni_prec, ni_prec);
1193   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1194
1195   tree iv_type = NULL_TREE;
1196   opt_scalar_int_mode tmode_iter;
1197   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1198     {
1199       scalar_mode tmode = tmode_iter.require ();
1200       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1201
1202       /* ??? Do we really want to construct one IV whose precision exceeds
1203          BITS_PER_WORD?  */
1204       if (tbits > BITS_PER_WORD)
1205         break;
1206
1207       /* Find the first available standard integral type.  */
1208       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1209         {
1210           iv_type = build_nonstandard_integer_type (tbits, true);
1211           break;
1212         }
1213     }
1214
1215   if (!iv_type)
1216     {
1217       if (dump_enabled_p ())
1218         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219                          "can't vectorize with length-based partial vectors"
1220                          " because there is no suitable iv type.\n");
1221       return false;
1222     }
1223
1224   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1226
1227   return true;
1228 }
1229
1230 /* Calculate the cost of one scalar iteration of the loop.  */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1233 {
1234   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236   int nbbs = loop->num_nodes, factor;
1237   int innerloop_iters, i;
1238
1239   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1240
1241   /* Gather costs for statements in the scalar loop.  */
1242
1243   /* FORNOW.  */
1244   innerloop_iters = 1;
1245   if (loop->inner)
1246     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1247
1248   for (i = 0; i < nbbs; i++)
1249     {
1250       gimple_stmt_iterator si;
1251       basic_block bb = bbs[i];
1252
1253       if (bb->loop_father == loop->inner)
1254         factor = innerloop_iters;
1255       else
1256         factor = 1;
1257
1258       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1259         {
1260           gimple *stmt = gsi_stmt (si);
1261           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1262
1263           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264             continue;
1265
1266           /* Skip stmts that are not vectorized inside the loop.  */
1267           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269               && (!STMT_VINFO_LIVE_P (vstmt_info)
1270                   || !VECTORIZABLE_CYCLE_DEF
1271                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272             continue;
1273
1274           vect_cost_for_stmt kind;
1275           if (STMT_VINFO_DATA_REF (stmt_info))
1276             {
1277               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278                kind = scalar_load;
1279              else
1280                kind = scalar_store;
1281             }
1282           else if (vect_nop_conversion_p (stmt_info))
1283             continue;
1284           else
1285             kind = scalar_stmt;
1286
1287           /* We are using vect_prologue here to avoid scaling twice
1288              by the inner loop factor.  */
1289           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1290                             factor, kind, stmt_info, 0, vect_prologue);
1291         }
1292     }
1293
1294   /* Now accumulate cost.  */
1295   vector_costs *target_cost_data = init_cost (loop_vinfo, true);
1296   stmt_info_for_cost *si;
1297   int j;
1298   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1299                     j, si)
1300     (void) add_stmt_cost (target_cost_data, si->count,
1301                           si->kind, si->stmt_info, si->vectype,
1302                           si->misalign, si->where);
1303   unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0;
1304   finish_cost (target_cost_data, &prologue_cost, &body_cost,
1305                &epilogue_cost);
1306   delete target_cost_data;
1307   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1308     = prologue_cost + body_cost + epilogue_cost;
1309 }
1310
1311
1312 /* Function vect_analyze_loop_form.
1313
1314    Verify that certain CFG restrictions hold, including:
1315    - the loop has a pre-header
1316    - the loop has a single entry and exit
1317    - the loop exit condition is simple enough
1318    - the number of iterations can be analyzed, i.e, a countable loop.  The
1319      niter could be analyzed under some assumptions.  */
1320
1321 opt_result
1322 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1323 {
1324   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1325
1326   /* Different restrictions apply when we are considering an inner-most loop,
1327      vs. an outer (nested) loop.
1328      (FORNOW. May want to relax some of these restrictions in the future).  */
1329
1330   info->inner_loop_cond = NULL;
1331   if (!loop->inner)
1332     {
1333       /* Inner-most loop.  We currently require that the number of BBs is
1334          exactly 2 (the header and latch).  Vectorizable inner-most loops
1335          look like this:
1336
1337                         (pre-header)
1338                            |
1339                           header <--------+
1340                            | |            |
1341                            | +--> latch --+
1342                            |
1343                         (exit-bb)  */
1344
1345       if (loop->num_nodes != 2)
1346         return opt_result::failure_at (vect_location,
1347                                        "not vectorized:"
1348                                        " control flow in loop.\n");
1349
1350       if (empty_block_p (loop->header))
1351         return opt_result::failure_at (vect_location,
1352                                        "not vectorized: empty loop.\n");
1353     }
1354   else
1355     {
1356       class loop *innerloop = loop->inner;
1357       edge entryedge;
1358
1359       /* Nested loop. We currently require that the loop is doubly-nested,
1360          contains a single inner loop, and the number of BBs is exactly 5.
1361          Vectorizable outer-loops look like this:
1362
1363                         (pre-header)
1364                            |
1365                           header <---+
1366                            |         |
1367                           inner-loop |
1368                            |         |
1369                           tail ------+
1370                            |
1371                         (exit-bb)
1372
1373          The inner-loop has the properties expected of inner-most loops
1374          as described above.  */
1375
1376       if ((loop->inner)->inner || (loop->inner)->next)
1377         return opt_result::failure_at (vect_location,
1378                                        "not vectorized:"
1379                                        " multiple nested loops.\n");
1380
1381       if (loop->num_nodes != 5)
1382         return opt_result::failure_at (vect_location,
1383                                        "not vectorized:"
1384                                        " control flow in loop.\n");
1385
1386       entryedge = loop_preheader_edge (innerloop);
1387       if (entryedge->src != loop->header
1388           || !single_exit (innerloop)
1389           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1390         return opt_result::failure_at (vect_location,
1391                                        "not vectorized:"
1392                                        " unsupported outerloop form.\n");
1393
1394       /* Analyze the inner-loop.  */
1395       vect_loop_form_info inner;
1396       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1397       if (!res)
1398         {
1399           if (dump_enabled_p ())
1400             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401                              "not vectorized: Bad inner loop.\n");
1402           return res;
1403         }
1404
1405       /* Don't support analyzing niter under assumptions for inner
1406          loop.  */
1407       if (!integer_onep (inner.assumptions))
1408         return opt_result::failure_at (vect_location,
1409                                        "not vectorized: Bad inner loop.\n");
1410
1411       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1412         return opt_result::failure_at (vect_location,
1413                                        "not vectorized: inner-loop count not"
1414                                        " invariant.\n");
1415
1416       if (dump_enabled_p ())
1417         dump_printf_loc (MSG_NOTE, vect_location,
1418                          "Considering outer-loop vectorization.\n");
1419       info->inner_loop_cond = inner.loop_cond;
1420     }
1421
1422   if (!single_exit (loop))
1423     return opt_result::failure_at (vect_location,
1424                                    "not vectorized: multiple exits.\n");
1425   if (EDGE_COUNT (loop->header->preds) != 2)
1426     return opt_result::failure_at (vect_location,
1427                                    "not vectorized:"
1428                                    " too many incoming edges.\n");
1429
1430   /* We assume that the loop exit condition is at the end of the loop. i.e,
1431      that the loop is represented as a do-while (with a proper if-guard
1432      before the loop if needed), where the loop header contains all the
1433      executable statements, and the latch is empty.  */
1434   if (!empty_block_p (loop->latch)
1435       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1436     return opt_result::failure_at (vect_location,
1437                                    "not vectorized: latch block not empty.\n");
1438
1439   /* Make sure the exit is not abnormal.  */
1440   edge e = single_exit (loop);
1441   if (e->flags & EDGE_ABNORMAL)
1442     return opt_result::failure_at (vect_location,
1443                                    "not vectorized:"
1444                                    " abnormal loop exit edge.\n");
1445
1446   info->loop_cond
1447     = vect_get_loop_niters (loop, &info->assumptions,
1448                             &info->number_of_iterations,
1449                             &info->number_of_iterationsm1);
1450   if (!info->loop_cond)
1451     return opt_result::failure_at
1452       (vect_location,
1453        "not vectorized: complicated exit condition.\n");
1454
1455   if (integer_zerop (info->assumptions)
1456       || !info->number_of_iterations
1457       || chrec_contains_undetermined (info->number_of_iterations))
1458     return opt_result::failure_at
1459       (info->loop_cond,
1460        "not vectorized: number of iterations cannot be computed.\n");
1461
1462   if (integer_zerop (info->number_of_iterations))
1463     return opt_result::failure_at
1464       (info->loop_cond,
1465        "not vectorized: number of iterations = 0.\n");
1466
1467   return opt_result::success ();
1468 }
1469
1470 /* Create a loop_vec_info for LOOP with SHARED and the
1471    vect_analyze_loop_form result.  */
1472
1473 loop_vec_info
1474 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1475                         const vect_loop_form_info *info)
1476 {
1477   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1478   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1479   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1480   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1481   if (!integer_onep (info->assumptions))
1482     {
1483       /* We consider to vectorize this loop by versioning it under
1484          some assumptions.  In order to do this, we need to clear
1485          existing information computed by scev and niter analyzer.  */
1486       scev_reset_htab ();
1487       free_numbers_of_iterations_estimates (loop);
1488       /* Also set flag for this loop so that following scev and niter
1489          analysis are done under the assumptions.  */
1490       loop_constraint_set (loop, LOOP_C_FINITE);
1491       /* Also record the assumptions for versioning.  */
1492       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1493     }
1494
1495   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1496     {
1497       if (dump_enabled_p ())
1498         {
1499           dump_printf_loc (MSG_NOTE, vect_location,
1500                            "Symbolic number of iterations is ");
1501           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1502           dump_printf (MSG_NOTE, "\n");
1503         }
1504     }
1505
1506   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1507   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508   if (info->inner_loop_cond)
1509     {
1510       stmt_vec_info inner_loop_cond_info
1511         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1512       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1513       /* If we have an estimate on the number of iterations of the inner
1514          loop use that to limit the scale for costing, otherwise use
1515          --param vect-inner-loop-cost-factor literally.  */
1516       widest_int nit;
1517       if (estimated_stmt_executions (loop->inner, &nit))
1518         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1519           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1520     }
1521
1522   return loop_vinfo;
1523 }
1524
1525
1526
1527 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1528    statements update the vectorization factor.  */
1529
1530 static void
1531 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1532 {
1533   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1534   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1535   int nbbs = loop->num_nodes;
1536   poly_uint64 vectorization_factor;
1537   int i;
1538
1539   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1540
1541   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1542   gcc_assert (known_ne (vectorization_factor, 0U));
1543
1544   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1545      vectorization factor of the loop is the unrolling factor required by
1546      the SLP instances.  If that unrolling factor is 1, we say, that we
1547      perform pure SLP on loop - cross iteration parallelism is not
1548      exploited.  */
1549   bool only_slp_in_loop = true;
1550   for (i = 0; i < nbbs; i++)
1551     {
1552       basic_block bb = bbs[i];
1553       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1554            gsi_next (&si))
1555         {
1556           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1557           if (!stmt_info)
1558             continue;
1559           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1560                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1561               && !PURE_SLP_STMT (stmt_info))
1562             /* STMT needs both SLP and loop-based vectorization.  */
1563             only_slp_in_loop = false;
1564         }
1565       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566            gsi_next (&si))
1567         {
1568           if (is_gimple_debug (gsi_stmt (si)))
1569             continue;
1570           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1571           stmt_info = vect_stmt_to_vectorize (stmt_info);
1572           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574               && !PURE_SLP_STMT (stmt_info))
1575             /* STMT needs both SLP and loop-based vectorization.  */
1576             only_slp_in_loop = false;
1577         }
1578     }
1579
1580   if (only_slp_in_loop)
1581     {
1582       if (dump_enabled_p ())
1583         dump_printf_loc (MSG_NOTE, vect_location,
1584                          "Loop contains only SLP stmts\n");
1585       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1586     }
1587   else
1588     {
1589       if (dump_enabled_p ())
1590         dump_printf_loc (MSG_NOTE, vect_location,
1591                          "Loop contains SLP and non-SLP stmts\n");
1592       /* Both the vectorization factor and unroll factor have the form
1593          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1594          so they must have a common multiple.  */
1595       vectorization_factor
1596         = force_common_multiple (vectorization_factor,
1597                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1598     }
1599
1600   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1601   if (dump_enabled_p ())
1602     {
1603       dump_printf_loc (MSG_NOTE, vect_location,
1604                        "Updating vectorization factor to ");
1605       dump_dec (MSG_NOTE, vectorization_factor);
1606       dump_printf (MSG_NOTE, ".\n");
1607     }
1608 }
1609
1610 /* Return true if STMT_INFO describes a double reduction phi and if
1611    the other phi in the reduction is also relevant for vectorization.
1612    This rejects cases such as:
1613
1614       outer1:
1615         x_1 = PHI <x_3(outer2), ...>;
1616         ...
1617
1618       inner:
1619         x_2 = ...;
1620         ...
1621
1622       outer2:
1623         x_3 = PHI <x_2(inner)>;
1624
1625    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1626
1627 static bool
1628 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1629 {
1630   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1631     return false;
1632
1633   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1634 }
1635
1636 /* Function vect_analyze_loop_operations.
1637
1638    Scan the loop stmts and make sure they are all vectorizable.  */
1639
1640 static opt_result
1641 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1642 {
1643   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1644   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1645   int nbbs = loop->num_nodes;
1646   int i;
1647   stmt_vec_info stmt_info;
1648   bool need_to_vectorize = false;
1649   bool ok;
1650
1651   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1652
1653   auto_vec<stmt_info_for_cost> cost_vec;
1654
1655   for (i = 0; i < nbbs; i++)
1656     {
1657       basic_block bb = bbs[i];
1658
1659       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1660            gsi_next (&si))
1661         {
1662           gphi *phi = si.phi ();
1663           ok = true;
1664
1665           stmt_info = loop_vinfo->lookup_stmt (phi);
1666           if (dump_enabled_p ())
1667             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1668           if (virtual_operand_p (gimple_phi_result (phi)))
1669             continue;
1670
1671           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1672              (i.e., a phi in the tail of the outer-loop).  */
1673           if (! is_loop_header_bb_p (bb))
1674             {
1675               /* FORNOW: we currently don't support the case that these phis
1676                  are not used in the outerloop (unless it is double reduction,
1677                  i.e., this phi is vect_reduction_def), cause this case
1678                  requires to actually do something here.  */
1679               if (STMT_VINFO_LIVE_P (stmt_info)
1680                   && !vect_active_double_reduction_p (stmt_info))
1681                 return opt_result::failure_at (phi,
1682                                                "Unsupported loop-closed phi"
1683                                                " in outer-loop.\n");
1684
1685               /* If PHI is used in the outer loop, we check that its operand
1686                  is defined in the inner loop.  */
1687               if (STMT_VINFO_RELEVANT_P (stmt_info))
1688                 {
1689                   tree phi_op;
1690
1691                   if (gimple_phi_num_args (phi) != 1)
1692                     return opt_result::failure_at (phi, "unsupported phi");
1693
1694                   phi_op = PHI_ARG_DEF (phi, 0);
1695                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1696                   if (!op_def_info)
1697                     return opt_result::failure_at (phi, "unsupported phi\n");
1698
1699                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1700                       && (STMT_VINFO_RELEVANT (op_def_info)
1701                           != vect_used_in_outer_by_reduction))
1702                     return opt_result::failure_at (phi, "unsupported phi\n");
1703
1704                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1705                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1706                            == vect_double_reduction_def))
1707                       && !vectorizable_lc_phi (loop_vinfo,
1708                                                stmt_info, NULL, NULL))
1709                     return opt_result::failure_at (phi, "unsupported phi\n");
1710                 }
1711
1712               continue;
1713             }
1714
1715           gcc_assert (stmt_info);
1716
1717           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1718                || STMT_VINFO_LIVE_P (stmt_info))
1719               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1720             /* A scalar-dependence cycle that we don't support.  */
1721             return opt_result::failure_at (phi,
1722                                            "not vectorized:"
1723                                            " scalar dependence cycle.\n");
1724
1725           if (STMT_VINFO_RELEVANT_P (stmt_info))
1726             {
1727               need_to_vectorize = true;
1728               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729                   && ! PURE_SLP_STMT (stmt_info))
1730                 ok = vectorizable_induction (loop_vinfo,
1731                                              stmt_info, NULL, NULL,
1732                                              &cost_vec);
1733               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1734                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1735                             == vect_double_reduction_def)
1736                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1737                        && ! PURE_SLP_STMT (stmt_info))
1738                 ok = vectorizable_reduction (loop_vinfo,
1739                                              stmt_info, NULL, NULL, &cost_vec);
1740             }
1741
1742           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1743           if (ok
1744               && STMT_VINFO_LIVE_P (stmt_info)
1745               && !PURE_SLP_STMT (stmt_info))
1746             ok = vectorizable_live_operation (loop_vinfo,
1747                                               stmt_info, NULL, NULL, NULL,
1748                                               -1, false, &cost_vec);
1749
1750           if (!ok)
1751             return opt_result::failure_at (phi,
1752                                            "not vectorized: relevant phi not "
1753                                            "supported: %G",
1754                                            static_cast <gimple *> (phi));
1755         }
1756
1757       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1758            gsi_next (&si))
1759         {
1760           gimple *stmt = gsi_stmt (si);
1761           if (!gimple_clobber_p (stmt)
1762               && !is_gimple_debug (stmt))
1763             {
1764               opt_result res
1765                 = vect_analyze_stmt (loop_vinfo,
1766                                      loop_vinfo->lookup_stmt (stmt),
1767                                      &need_to_vectorize,
1768                                      NULL, NULL, &cost_vec);
1769               if (!res)
1770                 return res;
1771             }
1772         }
1773     } /* bbs */
1774
1775   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1776
1777   /* All operations in the loop are either irrelevant (deal with loop
1778      control, or dead), or only used outside the loop and can be moved
1779      out of the loop (e.g. invariants, inductions).  The loop can be
1780      optimized away by scalar optimizations.  We're better off not
1781      touching this loop.  */
1782   if (!need_to_vectorize)
1783     {
1784       if (dump_enabled_p ())
1785         dump_printf_loc (MSG_NOTE, vect_location,
1786                          "All the computation can be taken out of the loop.\n");
1787       return opt_result::failure_at
1788         (vect_location,
1789          "not vectorized: redundant loop. no profit to vectorize.\n");
1790     }
1791
1792   return opt_result::success ();
1793 }
1794
1795 /* Return true if we know that the iteration count is smaller than the
1796    vectorization factor.  Return false if it isn't, or if we can't be sure
1797    either way.  */
1798
1799 static bool
1800 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1801 {
1802   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1803
1804   HOST_WIDE_INT max_niter;
1805   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1806     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1807   else
1808     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1809
1810   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1811     return true;
1812
1813   return false;
1814 }
1815
1816 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1817    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1818    definitely no, or -1 if it's worth retrying.  */
1819
1820 static int
1821 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1822 {
1823   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1824   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1825
1826   /* Only loops that can handle partially-populated vectors can have iteration
1827      counts less than the vectorization factor.  */
1828   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1829     {
1830       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1831         {
1832           if (dump_enabled_p ())
1833             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834                              "not vectorized: iteration count smaller than "
1835                              "vectorization factor.\n");
1836           return 0;
1837         }
1838     }
1839
1840   /* If using the "very cheap" model. reject cases in which we'd keep
1841      a copy of the scalar code (even if we might be able to vectorize it).  */
1842   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1843       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1844           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1845           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1846     {
1847       if (dump_enabled_p ())
1848         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849                          "some scalar iterations would need to be peeled\n");
1850       return 0;
1851     }
1852
1853   int min_profitable_iters, min_profitable_estimate;
1854   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1855                                       &min_profitable_estimate);
1856
1857   if (min_profitable_iters < 0)
1858     {
1859       if (dump_enabled_p ())
1860         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861                          "not vectorized: vectorization not profitable.\n");
1862       if (dump_enabled_p ())
1863         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864                          "not vectorized: vector version will never be "
1865                          "profitable.\n");
1866       return -1;
1867     }
1868
1869   int min_scalar_loop_bound = (param_min_vect_loop_bound
1870                                * assumed_vf);
1871
1872   /* Use the cost model only if it is more conservative than user specified
1873      threshold.  */
1874   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1875                                     min_profitable_iters);
1876
1877   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1878
1879   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1880       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1881     {
1882       if (dump_enabled_p ())
1883         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884                          "not vectorized: vectorization not profitable.\n");
1885       if (dump_enabled_p ())
1886         dump_printf_loc (MSG_NOTE, vect_location,
1887                          "not vectorized: iteration count smaller than user "
1888                          "specified loop bound parameter or minimum profitable "
1889                          "iterations (whichever is more conservative).\n");
1890       return 0;
1891     }
1892
1893   /* The static profitablity threshold min_profitable_estimate includes
1894      the cost of having to check at runtime whether the scalar loop
1895      should be used instead.  If it turns out that we don't need or want
1896      such a check, the threshold we should use for the static estimate
1897      is simply the point at which the vector loop becomes more profitable
1898      than the scalar loop.  */
1899   if (min_profitable_estimate > min_profitable_iters
1900       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1901       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1902       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1903       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1904     {
1905       if (dump_enabled_p ())
1906         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1907                          " choice between the scalar and vector loops\n");
1908       min_profitable_estimate = min_profitable_iters;
1909     }
1910
1911   /* If the vector loop needs multiple iterations to be beneficial then
1912      things are probably too close to call, and the conservative thing
1913      would be to stick with the scalar code.  */
1914   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1915       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1916     {
1917       if (dump_enabled_p ())
1918         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919                          "one iteration of the vector loop would be"
1920                          " more expensive than the equivalent number of"
1921                          " iterations of the scalar loop\n");
1922       return 0;
1923     }
1924
1925   HOST_WIDE_INT estimated_niter;
1926
1927   /* If we are vectorizing an epilogue then we know the maximum number of
1928      scalar iterations it will cover is at least one lower than the
1929      vectorization factor of the main loop.  */
1930   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1931     estimated_niter
1932       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1933   else
1934     {
1935       estimated_niter = estimated_stmt_executions_int (loop);
1936       if (estimated_niter == -1)
1937         estimated_niter = likely_max_stmt_executions_int (loop);
1938     }
1939   if (estimated_niter != -1
1940       && ((unsigned HOST_WIDE_INT) estimated_niter
1941           < MAX (th, (unsigned) min_profitable_estimate)))
1942     {
1943       if (dump_enabled_p ())
1944         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1945                          "not vectorized: estimated iteration count too "
1946                          "small.\n");
1947       if (dump_enabled_p ())
1948         dump_printf_loc (MSG_NOTE, vect_location,
1949                          "not vectorized: estimated iteration count smaller "
1950                          "than specified loop bound parameter or minimum "
1951                          "profitable iterations (whichever is more "
1952                          "conservative).\n");
1953       return -1;
1954     }
1955
1956   return 1;
1957 }
1958
1959 static opt_result
1960 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1961                            vec<data_reference_p> *datarefs,
1962                            unsigned int *n_stmts)
1963 {
1964   *n_stmts = 0;
1965   for (unsigned i = 0; i < loop->num_nodes; i++)
1966     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1967          !gsi_end_p (gsi); gsi_next (&gsi))
1968       {
1969         gimple *stmt = gsi_stmt (gsi);
1970         if (is_gimple_debug (stmt))
1971           continue;
1972         ++(*n_stmts);
1973         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1974                                                         NULL, 0);
1975         if (!res)
1976           {
1977             if (is_gimple_call (stmt) && loop->safelen)
1978               {
1979                 tree fndecl = gimple_call_fndecl (stmt), op;
1980                 if (fndecl != NULL_TREE)
1981                   {
1982                     cgraph_node *node = cgraph_node::get (fndecl);
1983                     if (node != NULL && node->simd_clones != NULL)
1984                       {
1985                         unsigned int j, n = gimple_call_num_args (stmt);
1986                         for (j = 0; j < n; j++)
1987                           {
1988                             op = gimple_call_arg (stmt, j);
1989                             if (DECL_P (op)
1990                                 || (REFERENCE_CLASS_P (op)
1991                                     && get_base_address (op)))
1992                               break;
1993                           }
1994                         op = gimple_call_lhs (stmt);
1995                         /* Ignore #pragma omp declare simd functions
1996                            if they don't have data references in the
1997                            call stmt itself.  */
1998                         if (j == n
1999                             && !(op
2000                                  && (DECL_P (op)
2001                                      || (REFERENCE_CLASS_P (op)
2002                                          && get_base_address (op)))))
2003                           continue;
2004                       }
2005                   }
2006               }
2007             return res;
2008           }
2009         /* If dependence analysis will give up due to the limit on the
2010            number of datarefs stop here and fail fatally.  */
2011         if (datarefs->length ()
2012             > (unsigned)param_loop_max_datarefs_for_datadeps)
2013           return opt_result::failure_at (stmt, "exceeded param "
2014                                          "loop-max-datarefs-for-datadeps\n");
2015       }
2016   return opt_result::success ();
2017 }
2018
2019 /* Look for SLP-only access groups and turn each individual access into its own
2020    group.  */
2021 static void
2022 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2023 {
2024   unsigned int i;
2025   struct data_reference *dr;
2026
2027   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2028
2029   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2030   FOR_EACH_VEC_ELT (datarefs, i, dr)
2031     {
2032       gcc_assert (DR_REF (dr));
2033       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2034
2035       /* Check if the load is a part of an interleaving chain.  */
2036       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037         {
2038           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2039           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2040           unsigned int group_size = DR_GROUP_SIZE (first_element);
2041
2042           /* Check if SLP-only groups.  */
2043           if (!STMT_SLP_TYPE (stmt_info)
2044               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2045             {
2046               /* Dissolve the group.  */
2047               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2048
2049               stmt_vec_info vinfo = first_element;
2050               while (vinfo)
2051                 {
2052                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2053                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2054                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2055                   DR_GROUP_SIZE (vinfo) = 1;
2056                   if (STMT_VINFO_STRIDED_P (first_element))
2057                     DR_GROUP_GAP (vinfo) = 0;
2058                   else
2059                     DR_GROUP_GAP (vinfo) = group_size - 1;
2060                   /* Duplicate and adjust alignment info, it needs to
2061                      be present on each group leader, see dr_misalignment.  */
2062                   if (vinfo != first_element)
2063                     {
2064                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2065                       dr_info2->target_alignment = dr_info->target_alignment;
2066                       int misalignment = dr_info->misalignment;
2067                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2068                         {
2069                           HOST_WIDE_INT diff
2070                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2071                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2072                           unsigned HOST_WIDE_INT align_c
2073                             = dr_info->target_alignment.to_constant ();
2074                           misalignment = (misalignment + diff) % align_c;
2075                         }
2076                       dr_info2->misalignment = misalignment;
2077                     }
2078                   vinfo = next;
2079                 }
2080             }
2081         }
2082     }
2083 }
2084
2085 /* Determine if operating on full vectors for LOOP_VINFO might leave
2086    some scalar iterations still to do.  If so, decide how we should
2087    handle those scalar iterations.  The possibilities are:
2088
2089    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2090        In this case:
2091
2092          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2093          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2094          LOOP_VINFO_PEELING_FOR_NITER == false
2095
2096    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2097        to handle the remaining scalar iterations.  In this case:
2098
2099          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2100          LOOP_VINFO_PEELING_FOR_NITER == true
2101
2102        There are two choices:
2103
2104        (2a) Consider vectorizing the epilogue loop at the same VF as the
2105             main loop, but using partial vectors instead of full vectors.
2106             In this case:
2107
2108               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2109
2110        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2111             In this case:
2112
2113               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2114
2115    When FOR_EPILOGUE_P is true, make this determination based on the
2116    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2117    based on the assumption that LOOP_VINFO is the main loop.  The caller
2118    has made sure that the number of iterations is set appropriately for
2119    this value of FOR_EPILOGUE_P.  */
2120
2121 opt_result
2122 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2123                                             bool for_epilogue_p)
2124 {
2125   /* Determine whether there would be any scalar iterations left over.  */
2126   bool need_peeling_or_partial_vectors_p
2127     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2128
2129   /* Decide whether to vectorize the loop with partial vectors.  */
2130   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2131   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2132   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2133       && need_peeling_or_partial_vectors_p)
2134     {
2135       /* For partial-vector-usage=1, try to push the handling of partial
2136          vectors to the epilogue, with the main loop continuing to operate
2137          on full vectors.
2138
2139          ??? We could then end up failing to use partial vectors if we
2140          decide to peel iterations into a prologue, and if the main loop
2141          then ends up processing fewer than VF iterations.  */
2142       if (param_vect_partial_vector_usage == 1
2143           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2144           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2145         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2146       else
2147         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2148     }
2149
2150   if (dump_enabled_p ())
2151     {
2152       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2153         dump_printf_loc (MSG_NOTE, vect_location,
2154                          "operating on partial vectors%s.\n",
2155                          for_epilogue_p ? " for epilogue loop" : "");
2156       else
2157         dump_printf_loc (MSG_NOTE, vect_location,
2158                          "operating only on full vectors%s.\n",
2159                          for_epilogue_p ? " for epilogue loop" : "");
2160     }
2161
2162   if (for_epilogue_p)
2163     {
2164       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165       gcc_assert (orig_loop_vinfo);
2166       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2167         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2168                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2169     }
2170
2171   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2172       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2173     {
2174       /* Check that the loop processes at least one full vector.  */
2175       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2176       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2177       if (known_lt (wi::to_widest (scalar_niters), vf))
2178         return opt_result::failure_at (vect_location,
2179                                        "loop does not have enough iterations"
2180                                        " to support vectorization.\n");
2181
2182       /* If we need to peel an extra epilogue iteration to handle data
2183          accesses with gaps, check that there are enough scalar iterations
2184          available.
2185
2186          The check above is redundant with this one when peeling for gaps,
2187          but the distinction is useful for diagnostics.  */
2188       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2189       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2190           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2191         return opt_result::failure_at (vect_location,
2192                                        "loop does not have enough iterations"
2193                                        " to support peeling for gaps.\n");
2194     }
2195
2196   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2197     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2198        && need_peeling_or_partial_vectors_p);
2199
2200   return opt_result::success ();
2201 }
2202
2203 /* Function vect_analyze_loop_2.
2204
2205    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2206    for it.  The different analyses will record information in the
2207    loop_vec_info struct.  */
2208 static opt_result
2209 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2210 {
2211   opt_result ok = opt_result::success ();
2212   int res;
2213   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2214   poly_uint64 min_vf = 2;
2215   loop_vec_info orig_loop_vinfo = NULL;
2216
2217   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2218      loop_vec_info of the first vectorized loop.  */
2219   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2220     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2221   else
2222     orig_loop_vinfo = loop_vinfo;
2223   gcc_assert (orig_loop_vinfo);
2224
2225   /* The first group of checks is independent of the vector size.  */
2226   fatal = true;
2227
2228   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2229       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2230     return opt_result::failure_at (vect_location,
2231                                    "not vectorized: simd if(0)\n");
2232
2233   /* Find all data references in the loop (which correspond to vdefs/vuses)
2234      and analyze their evolution in the loop.  */
2235
2236   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2237
2238   /* Gather the data references and count stmts in the loop.  */
2239   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2240     {
2241       opt_result res
2242         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2243                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2244                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2245       if (!res)
2246         {
2247           if (dump_enabled_p ())
2248             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249                              "not vectorized: loop contains function "
2250                              "calls or data references that cannot "
2251                              "be analyzed\n");
2252           return res;
2253         }
2254       loop_vinfo->shared->save_datarefs ();
2255     }
2256   else
2257     loop_vinfo->shared->check_datarefs ();
2258
2259   /* Analyze the data references and also adjust the minimal
2260      vectorization factor according to the loads and stores.  */
2261
2262   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2263   if (!ok)
2264     {
2265       if (dump_enabled_p ())
2266         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2267                          "bad data references.\n");
2268       return ok;
2269     }
2270
2271   /* Classify all cross-iteration scalar data-flow cycles.
2272      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2273   vect_analyze_scalar_cycles (loop_vinfo);
2274
2275   vect_pattern_recog (loop_vinfo);
2276
2277   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2278
2279   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2280      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2281
2282   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2283   if (!ok)
2284     {
2285       if (dump_enabled_p ())
2286         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287                          "bad data access.\n");
2288       return ok;
2289     }
2290
2291   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2292
2293   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2294   if (!ok)
2295     {
2296       if (dump_enabled_p ())
2297         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298                          "unexpected pattern.\n");
2299       return ok;
2300     }
2301
2302   /* While the rest of the analysis below depends on it in some way.  */
2303   fatal = false;
2304
2305   /* Analyze data dependences between the data-refs in the loop
2306      and adjust the maximum vectorization factor according to
2307      the dependences.
2308      FORNOW: fail at the first data dependence that we encounter.  */
2309
2310   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2311   if (!ok)
2312     {
2313       if (dump_enabled_p ())
2314         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2315                          "bad data dependence.\n");
2316       return ok;
2317     }
2318   if (max_vf != MAX_VECTORIZATION_FACTOR
2319       && maybe_lt (max_vf, min_vf))
2320     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2321   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2322
2323   ok = vect_determine_vectorization_factor (loop_vinfo);
2324   if (!ok)
2325     {
2326       if (dump_enabled_p ())
2327         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328                          "can't determine vectorization factor.\n");
2329       return ok;
2330     }
2331   if (max_vf != MAX_VECTORIZATION_FACTOR
2332       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2333     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2334
2335   /* Compute the scalar iteration cost.  */
2336   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2337
2338   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2339
2340   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2341   ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2342   if (!ok)
2343     return ok;
2344
2345   /* If there are any SLP instances mark them as pure_slp.  */
2346   bool slp = vect_make_slp_decision (loop_vinfo);
2347   if (slp)
2348     {
2349       /* Find stmts that need to be both vectorized and SLPed.  */
2350       vect_detect_hybrid_slp (loop_vinfo);
2351
2352       /* Update the vectorization factor based on the SLP decision.  */
2353       vect_update_vf_for_slp (loop_vinfo);
2354
2355       /* Optimize the SLP graph with the vectorization factor fixed.  */
2356       vect_optimize_slp (loop_vinfo);
2357
2358       /* Gather the loads reachable from the SLP graph entries.  */
2359       vect_gather_slp_loads (loop_vinfo);
2360     }
2361
2362   bool saved_can_use_partial_vectors_p
2363     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2364
2365   /* We don't expect to have to roll back to anything other than an empty
2366      set of rgroups.  */
2367   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2368
2369   /* This is the point where we can re-start analysis with SLP forced off.  */
2370 start_over:
2371
2372   /* Now the vectorization factor is final.  */
2373   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2374   gcc_assert (known_ne (vectorization_factor, 0U));
2375
2376   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2377     {
2378       dump_printf_loc (MSG_NOTE, vect_location,
2379                        "vectorization_factor = ");
2380       dump_dec (MSG_NOTE, vectorization_factor);
2381       dump_printf (MSG_NOTE, ", niters = %wd\n",
2382                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2383     }
2384
2385   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = init_cost (loop_vinfo, false);
2386
2387   /* Analyze the alignment of the data-refs in the loop.
2388      Fail if a data reference is found that cannot be vectorized.  */
2389
2390   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2391   if (!ok)
2392     {
2393       if (dump_enabled_p ())
2394         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395                          "bad data alignment.\n");
2396       return ok;
2397     }
2398
2399   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2400      It is important to call pruning after vect_analyze_data_ref_accesses,
2401      since we use grouping information gathered by interleaving analysis.  */
2402   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2403   if (!ok)
2404     return ok;
2405
2406   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2407      vectorization, since we do not want to add extra peeling or
2408      add versioning for alignment.  */
2409   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2410     /* This pass will decide on using loop versioning and/or loop peeling in
2411        order to enhance the alignment of data references in the loop.  */
2412     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2413   if (!ok)
2414     return ok;
2415
2416   if (slp)
2417     {
2418       /* Analyze operations in the SLP instances.  Note this may
2419          remove unsupported SLP instances which makes the above
2420          SLP kind detection invalid.  */
2421       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2422       vect_slp_analyze_operations (loop_vinfo);
2423       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2424         {
2425           ok = opt_result::failure_at (vect_location,
2426                                        "unsupported SLP instances\n");
2427           goto again;
2428         }
2429
2430       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2431       slp_tree load_node, slp_root;
2432       unsigned i, x;
2433       slp_instance instance;
2434       bool can_use_lanes = true;
2435       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2436         {
2437           slp_root = SLP_INSTANCE_TREE (instance);
2438           int group_size = SLP_TREE_LANES (slp_root);
2439           tree vectype = SLP_TREE_VECTYPE (slp_root);
2440           bool loads_permuted = false;
2441           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2442             {
2443               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2444                 continue;
2445               unsigned j;
2446               stmt_vec_info load_info;
2447               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2448                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2449                   {
2450                     loads_permuted = true;
2451                     break;
2452                   }
2453             }
2454
2455           /* If the loads and stores can be handled with load/store-lane
2456              instructions record it and move on to the next instance.  */
2457           if (loads_permuted
2458               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2459               && vect_store_lanes_supported (vectype, group_size, false))
2460             {
2461               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2462                 {
2463                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2464                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2465                   /* Use SLP for strided accesses (or if we can't
2466                      load-lanes).  */
2467                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2468                       || ! vect_load_lanes_supported
2469                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2470                              DR_GROUP_SIZE (stmt_vinfo), false))
2471                     break;
2472                 }
2473
2474               can_use_lanes
2475                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2476
2477               if (can_use_lanes && dump_enabled_p ())
2478                 dump_printf_loc (MSG_NOTE, vect_location,
2479                                  "SLP instance %p can use load/store-lanes\n",
2480                                  instance);
2481             }
2482           else
2483             {
2484               can_use_lanes = false;
2485               break;
2486             }
2487         }
2488
2489       /* If all SLP instances can use load/store-lanes abort SLP and try again
2490          with SLP disabled.  */
2491       if (can_use_lanes)
2492         {
2493           ok = opt_result::failure_at (vect_location,
2494                                        "Built SLP cancelled: can use "
2495                                        "load/store-lanes\n");
2496           if (dump_enabled_p ())
2497             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2498                              "Built SLP cancelled: all SLP instances support "
2499                              "load/store-lanes\n");
2500           goto again;
2501         }
2502     }
2503
2504   /* Dissolve SLP-only groups.  */
2505   vect_dissolve_slp_only_groups (loop_vinfo);
2506
2507   /* Scan all the remaining operations in the loop that are not subject
2508      to SLP and make sure they are vectorizable.  */
2509   ok = vect_analyze_loop_operations (loop_vinfo);
2510   if (!ok)
2511     {
2512       if (dump_enabled_p ())
2513         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514                          "bad operation or unsupported loop bound.\n");
2515       return ok;
2516     }
2517
2518   /* For now, we don't expect to mix both masking and length approaches for one
2519      loop, disable it if both are recorded.  */
2520   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2521       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2522       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2523     {
2524       if (dump_enabled_p ())
2525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526                          "can't vectorize a loop with partial vectors"
2527                          " because we don't expect to mix different"
2528                          " approaches with partial vectors for the"
2529                          " same loop.\n");
2530       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2531     }
2532
2533   /* If we still have the option of using partial vectors,
2534      check whether we can generate the necessary loop controls.  */
2535   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2536       && !vect_verify_full_masking (loop_vinfo)
2537       && !vect_verify_loop_lens (loop_vinfo))
2538     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2539
2540   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2541      to be able to handle fewer than VF scalars, or needs to have a lower VF
2542      than the main loop.  */
2543   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2544       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2545       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2546                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2547     return opt_result::failure_at (vect_location,
2548                                    "Vectorization factor too high for"
2549                                    " epilogue loop.\n");
2550
2551   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2552      assuming that the loop will be used as a main loop.  We will redo
2553      this analysis later if we instead decide to use the loop as an
2554      epilogue loop.  */
2555   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2556   if (!ok)
2557     return ok;
2558
2559   /* Check the costings of the loop make vectorizing worthwhile.  */
2560   res = vect_analyze_loop_costing (loop_vinfo);
2561   if (res < 0)
2562     {
2563       ok = opt_result::failure_at (vect_location,
2564                                    "Loop costings may not be worthwhile.\n");
2565       goto again;
2566     }
2567   if (!res)
2568     return opt_result::failure_at (vect_location,
2569                                    "Loop costings not worthwhile.\n");
2570
2571   /* If an epilogue loop is required make sure we can create one.  */
2572   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2573       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2574     {
2575       if (dump_enabled_p ())
2576         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2577       if (!vect_can_advance_ivs_p (loop_vinfo)
2578           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2579                                            single_exit (LOOP_VINFO_LOOP
2580                                                          (loop_vinfo))))
2581         {
2582           ok = opt_result::failure_at (vect_location,
2583                                        "not vectorized: can't create required "
2584                                        "epilog loop\n");
2585           goto again;
2586         }
2587     }
2588
2589   /* During peeling, we need to check if number of loop iterations is
2590      enough for both peeled prolog loop and vector loop.  This check
2591      can be merged along with threshold check of loop versioning, so
2592      increase threshold for this case if necessary.
2593
2594      If we are analyzing an epilogue we still want to check what its
2595      versioning threshold would be.  If we decide to vectorize the epilogues we
2596      will want to use the lowest versioning threshold of all epilogues and main
2597      loop.  This will enable us to enter a vectorized epilogue even when
2598      versioning the loop.  We can't simply check whether the epilogue requires
2599      versioning though since we may have skipped some versioning checks when
2600      analyzing the epilogue.  For instance, checks for alias versioning will be
2601      skipped when dealing with epilogues as we assume we already checked them
2602      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2603   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2604     {
2605       poly_uint64 niters_th = 0;
2606       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2607
2608       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2609         {
2610           /* Niters for peeled prolog loop.  */
2611           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2612             {
2613               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2614               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2615               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2616             }
2617           else
2618             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2619         }
2620
2621       /* Niters for at least one iteration of vectorized loop.  */
2622       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2623         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2624       /* One additional iteration because of peeling for gap.  */
2625       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2626         niters_th += 1;
2627
2628       /*  Use the same condition as vect_transform_loop to decide when to use
2629           the cost to determine a versioning threshold.  */
2630       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2631           && ordered_p (th, niters_th))
2632         niters_th = ordered_max (poly_uint64 (th), niters_th);
2633
2634       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2635     }
2636
2637   gcc_assert (known_eq (vectorization_factor,
2638                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2639
2640   /* Ok to vectorize!  */
2641   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2642   return opt_result::success ();
2643
2644 again:
2645   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2646   gcc_assert (!ok);
2647
2648   /* Try again with SLP forced off but if we didn't do any SLP there is
2649      no point in re-trying.  */
2650   if (!slp)
2651     return ok;
2652
2653   /* If there are reduction chains re-trying will fail anyway.  */
2654   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2655     return ok;
2656
2657   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2658      via interleaving or lane instructions.  */
2659   slp_instance instance;
2660   slp_tree node;
2661   unsigned i, j;
2662   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2663     {
2664       stmt_vec_info vinfo;
2665       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2666       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2667         continue;
2668       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2669       unsigned int size = DR_GROUP_SIZE (vinfo);
2670       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2671       if (! vect_store_lanes_supported (vectype, size, false)
2672          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2673          && ! vect_grouped_store_supported (vectype, size))
2674         return opt_result::failure_at (vinfo->stmt,
2675                                        "unsupported grouped store\n");
2676       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2677         {
2678           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2679           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2680           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2681           size = DR_GROUP_SIZE (vinfo);
2682           vectype = STMT_VINFO_VECTYPE (vinfo);
2683           if (! vect_load_lanes_supported (vectype, size, false)
2684               && ! vect_grouped_load_supported (vectype, single_element_p,
2685                                                 size))
2686             return opt_result::failure_at (vinfo->stmt,
2687                                            "unsupported grouped load\n");
2688         }
2689     }
2690
2691   if (dump_enabled_p ())
2692     dump_printf_loc (MSG_NOTE, vect_location,
2693                      "re-trying with SLP disabled\n");
2694
2695   /* Roll back state appropriately.  No SLP this time.  */
2696   slp = false;
2697   /* Restore vectorization factor as it were without SLP.  */
2698   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2699   /* Free the SLP instances.  */
2700   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2701     vect_free_slp_instance (instance);
2702   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2703   /* Reset SLP type to loop_vect on all stmts.  */
2704   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2705     {
2706       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2707       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2708            !gsi_end_p (si); gsi_next (&si))
2709         {
2710           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2711           STMT_SLP_TYPE (stmt_info) = loop_vect;
2712           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2713               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2714             {
2715               /* vectorizable_reduction adjusts reduction stmt def-types,
2716                  restore them to that of the PHI.  */
2717               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2718                 = STMT_VINFO_DEF_TYPE (stmt_info);
2719               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2720                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2721                 = STMT_VINFO_DEF_TYPE (stmt_info);
2722             }
2723         }
2724       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2725            !gsi_end_p (si); gsi_next (&si))
2726         {
2727           if (is_gimple_debug (gsi_stmt (si)))
2728             continue;
2729           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2730           STMT_SLP_TYPE (stmt_info) = loop_vect;
2731           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2732             {
2733               stmt_vec_info pattern_stmt_info
2734                 = STMT_VINFO_RELATED_STMT (stmt_info);
2735               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2736                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2737
2738               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2739               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2740               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2741                    !gsi_end_p (pi); gsi_next (&pi))
2742                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2743                   = loop_vect;
2744             }
2745         }
2746     }
2747   /* Free optimized alias test DDRS.  */
2748   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2749   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2750   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2751   /* Reset target cost data.  */
2752   delete LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2753   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = nullptr;
2754   /* Reset accumulated rgroup information.  */
2755   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2756   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2757   /* Reset assorted flags.  */
2758   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2759   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2760   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2761   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2762   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2763     = saved_can_use_partial_vectors_p;
2764
2765   goto start_over;
2766 }
2767
2768 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2769    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2770    OLD_LOOP_VINFO is better unless something specifically indicates
2771    otherwise.
2772
2773    Note that this deliberately isn't a partial order.  */
2774
2775 static bool
2776 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2777                           loop_vec_info old_loop_vinfo)
2778 {
2779   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2780   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2781
2782   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2783   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2784
2785   /* Always prefer a VF of loop->simdlen over any other VF.  */
2786   if (loop->simdlen)
2787     {
2788       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2789       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2790       if (new_simdlen_p != old_simdlen_p)
2791         return new_simdlen_p;
2792     }
2793
2794   /* Limit the VFs to what is likely to be the maximum number of iterations,
2795      to handle cases in which at least one loop_vinfo is fully-masked.  */
2796   HOST_WIDE_INT estimated_max_niter;
2797   loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2798   unsigned HOST_WIDE_INT main_vf;
2799   if (main_loop
2800       && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2801       && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2802     estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2803   else
2804     estimated_max_niter = likely_max_stmt_executions_int (loop);
2805   if (estimated_max_niter != -1)
2806     {
2807       if (known_le (estimated_max_niter, new_vf))
2808         new_vf = estimated_max_niter;
2809       if (known_le (estimated_max_niter, old_vf))
2810         old_vf = estimated_max_niter;
2811     }
2812
2813   /* Check whether the (fractional) cost per scalar iteration is lower
2814      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
2815   poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2816   poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2817
2818   HOST_WIDE_INT est_rel_new_min
2819     = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2820   HOST_WIDE_INT est_rel_new_max
2821     = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2822
2823   HOST_WIDE_INT est_rel_old_min
2824     = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2825   HOST_WIDE_INT est_rel_old_max
2826     = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2827
2828   /* Check first if we can make out an unambigous total order from the minimum
2829      and maximum estimates.  */
2830   if (est_rel_new_min < est_rel_old_min
2831       && est_rel_new_max < est_rel_old_max)
2832     return true;
2833   else if (est_rel_old_min < est_rel_new_min
2834            && est_rel_old_max < est_rel_new_max)
2835     return false;
2836   /* When old_loop_vinfo uses a variable vectorization factor,
2837      we know that it has a lower cost for at least one runtime VF.
2838      However, we don't know how likely that VF is.
2839
2840      One option would be to compare the costs for the estimated VFs.
2841      The problem is that that can put too much pressure on the cost
2842      model.  E.g. if the estimated VF is also the lowest possible VF,
2843      and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2844      for the estimated VF, we'd then choose new_loop_vinfo even
2845      though (a) new_loop_vinfo might not actually be better than
2846      old_loop_vinfo for that VF and (b) it would be significantly
2847      worse at larger VFs.
2848
2849      Here we go for a hacky compromise: pick new_loop_vinfo if it is
2850      no more expensive than old_loop_vinfo even after doubling the
2851      estimated old_loop_vinfo VF.  For all but trivial loops, this
2852      ensures that we only pick new_loop_vinfo if it is significantly
2853      better than old_loop_vinfo at the estimated VF.  */
2854
2855   if (est_rel_old_min != est_rel_new_min
2856       || est_rel_old_max != est_rel_new_max)
2857     {
2858       HOST_WIDE_INT est_rel_new_likely
2859         = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2860       HOST_WIDE_INT est_rel_old_likely
2861         = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2862
2863       return est_rel_new_likely * 2 <= est_rel_old_likely;
2864     }
2865
2866   /* If there's nothing to choose between the loop bodies, see whether
2867      there's a difference in the prologue and epilogue costs.  */
2868   if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2869     return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2870
2871   return false;
2872 }
2873
2874 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2875    true if we should.  */
2876
2877 static bool
2878 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2879                         loop_vec_info old_loop_vinfo)
2880 {
2881   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2882     return false;
2883
2884   if (dump_enabled_p ())
2885     dump_printf_loc (MSG_NOTE, vect_location,
2886                      "***** Preferring vector mode %s to vector mode %s\n",
2887                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
2888                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
2889   return true;
2890 }
2891
2892 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2893    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2894    MODE_I to the next mode useful to analyze.
2895    Return the loop_vinfo on success and wrapped null on failure.  */
2896
2897 static opt_loop_vec_info
2898 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2899                      const vect_loop_form_info *loop_form_info,
2900                      loop_vec_info main_loop_vinfo,
2901                      const vector_modes &vector_modes, unsigned &mode_i,
2902                      machine_mode &autodetected_vector_mode,
2903                      bool &fatal)
2904 {
2905   loop_vec_info loop_vinfo
2906     = vect_create_loop_vinfo (loop, shared, loop_form_info);
2907   if (main_loop_vinfo)
2908     LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_vinfo;
2909
2910   machine_mode vector_mode = vector_modes[mode_i];
2911   loop_vinfo->vector_mode = vector_mode;
2912
2913   /* Run the main analysis.  */
2914   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2915   if (dump_enabled_p ())
2916     dump_printf_loc (MSG_NOTE, vect_location,
2917                      "***** Analysis %s with vector mode %s\n",
2918                      res ? "succeeded" : " failed",
2919                      GET_MODE_NAME (loop_vinfo->vector_mode));
2920
2921   /* Remember the autodetected vector mode.  */
2922   if (vector_mode == VOIDmode)
2923     autodetected_vector_mode = loop_vinfo->vector_mode;
2924
2925   /* Advance mode_i, first skipping modes that would result in the
2926      same analysis result.  */
2927   while (mode_i + 1 < vector_modes.length ()
2928          && vect_chooses_same_modes_p (loop_vinfo,
2929                                        vector_modes[mode_i + 1]))
2930     {
2931       if (dump_enabled_p ())
2932         dump_printf_loc (MSG_NOTE, vect_location,
2933                          "***** The result for vector mode %s would"
2934                          " be the same\n",
2935                          GET_MODE_NAME (vector_modes[mode_i + 1]));
2936       mode_i += 1;
2937     }
2938   if (mode_i + 1 < vector_modes.length ()
2939       && VECTOR_MODE_P (autodetected_vector_mode)
2940       && (related_vector_mode (vector_modes[mode_i + 1],
2941                                GET_MODE_INNER (autodetected_vector_mode))
2942           == autodetected_vector_mode)
2943       && (related_vector_mode (autodetected_vector_mode,
2944                                GET_MODE_INNER (vector_modes[mode_i + 1]))
2945           == vector_modes[mode_i + 1]))
2946     {
2947       if (dump_enabled_p ())
2948         dump_printf_loc (MSG_NOTE, vect_location,
2949                          "***** Skipping vector mode %s, which would"
2950                          " repeat the analysis for %s\n",
2951                          GET_MODE_NAME (vector_modes[mode_i + 1]),
2952                          GET_MODE_NAME (autodetected_vector_mode));
2953       mode_i += 1;
2954     }
2955   mode_i++;
2956
2957   if (!res)
2958     {
2959       delete loop_vinfo;
2960       if (fatal)
2961         gcc_checking_assert (main_loop_vinfo == NULL);
2962       return opt_loop_vec_info::propagate_failure (res);
2963     }
2964
2965   return opt_loop_vec_info::success (loop_vinfo);
2966 }
2967
2968 /* Function vect_analyze_loop.
2969
2970    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2971    for it.  The different analyses will record information in the
2972    loop_vec_info struct.  */
2973 opt_loop_vec_info
2974 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2975 {
2976   DUMP_VECT_SCOPE ("analyze_loop_nest");
2977
2978   if (loop_outer (loop)
2979       && loop_vec_info_for_loop (loop_outer (loop))
2980       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2981     return opt_loop_vec_info::failure_at (vect_location,
2982                                           "outer-loop already vectorized.\n");
2983
2984   if (!find_loop_nest (loop, &shared->loop_nest))
2985     return opt_loop_vec_info::failure_at
2986       (vect_location,
2987        "not vectorized: loop nest containing two or more consecutive inner"
2988        " loops cannot be vectorized\n");
2989
2990   /* Analyze the loop form.  */
2991   vect_loop_form_info loop_form_info;
2992   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2993   if (!res)
2994     {
2995       if (dump_enabled_p ())
2996         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2997                          "bad loop form.\n");
2998       return opt_loop_vec_info::propagate_failure (res);
2999     }
3000
3001   auto_vector_modes vector_modes;
3002   /* Autodetect first vector size we try.  */
3003   vector_modes.safe_push (VOIDmode);
3004   unsigned int autovec_flags
3005     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3006                                                     loop->simdlen != 0);
3007   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3008                              && !unlimited_cost_model (loop));
3009   machine_mode autodetected_vector_mode = VOIDmode;
3010   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3011   unsigned int mode_i = 0;
3012   unsigned int first_loop_i = 0;
3013   unsigned int first_loop_next_i = 0;
3014   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3015
3016   /* First determine the main loop vectorization mode, either the first
3017      one that works, starting with auto-detecting the vector mode and then
3018      following the targets order of preference, or the one with the
3019      lowest cost if pick_lowest_cost_p.  */
3020   while (1)
3021     {
3022       unsigned int loop_vinfo_i = mode_i;
3023       bool fatal;
3024       opt_loop_vec_info loop_vinfo
3025         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3026                                NULL, vector_modes, mode_i,
3027                                autodetected_vector_mode, fatal);
3028       if (fatal)
3029         break;
3030
3031       if (loop_vinfo)
3032         {
3033           /* Once we hit the desired simdlen for the first time,
3034              discard any previous attempts.  */
3035           if (simdlen
3036               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3037             {
3038               delete first_loop_vinfo;
3039               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3040               simdlen = 0;
3041             }
3042           else if (pick_lowest_cost_p
3043                    && first_loop_vinfo
3044                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3045             {
3046               /* Pick loop_vinfo over first_loop_vinfo.  */
3047               delete first_loop_vinfo;
3048               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3049             }
3050           if (first_loop_vinfo == NULL)
3051             {
3052               first_loop_vinfo = loop_vinfo;
3053               first_loop_i = loop_vinfo_i;
3054               first_loop_next_i = mode_i;
3055             }
3056           else
3057             {
3058               delete loop_vinfo;
3059               loop_vinfo = opt_loop_vec_info::success (NULL);
3060             }
3061
3062           /* Commit to first_loop_vinfo if we have no reason to try
3063              alternatives.  */
3064           if (!simdlen && !pick_lowest_cost_p)
3065             break;
3066         }
3067       if (mode_i == vector_modes.length ()
3068           || autodetected_vector_mode == VOIDmode)
3069         break;
3070
3071       /* Try the next biggest vector size.  */
3072       if (dump_enabled_p ())
3073         dump_printf_loc (MSG_NOTE, vect_location,
3074                          "***** Re-trying analysis with vector mode %s\n",
3075                          GET_MODE_NAME (vector_modes[mode_i]));
3076     }
3077   if (!first_loop_vinfo)
3078     return opt_loop_vec_info::propagate_failure (res);
3079
3080   if (dump_enabled_p ())
3081     dump_printf_loc (MSG_NOTE, vect_location,
3082                      "***** Choosing vector mode %s\n",
3083                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3084
3085   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3086      enabled, SIMDUID is not set, it is the innermost loop and we have
3087      either already found the loop's SIMDLEN or there was no SIMDLEN to
3088      begin with.
3089      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3090   bool vect_epilogues = (!simdlen
3091                          && loop->inner == NULL
3092                          && param_vect_epilogues_nomask
3093                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3094                          && !loop->simduid);
3095   if (!vect_epilogues)
3096     return first_loop_vinfo;
3097
3098   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3099   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3100
3101   /* Handle the case that the original loop can use partial
3102      vectorization, but want to only adopt it for the epilogue.
3103      The retry should be in the same mode as original.  */
3104   if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3105     {
3106       gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3107                   && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3108       if (dump_enabled_p ())
3109         dump_printf_loc (MSG_NOTE, vect_location,
3110                          "***** Re-trying analysis with same vector mode"
3111                          " %s for epilogue with partial vectors.\n",
3112                          GET_MODE_NAME (first_loop_vinfo->vector_mode));
3113       mode_i = first_loop_i;
3114     }
3115   else
3116     {
3117       mode_i = first_loop_next_i;
3118       if (mode_i == vector_modes.length ())
3119         return first_loop_vinfo;
3120     }
3121
3122   /* ???  If first_loop_vinfo was using VOIDmode then we probably
3123      want to instead search for the corresponding mode in vector_modes[].  */
3124
3125   while (1)
3126     {
3127       bool fatal;
3128       opt_loop_vec_info loop_vinfo
3129         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3130                                first_loop_vinfo,
3131                                vector_modes, mode_i,
3132                                autodetected_vector_mode, fatal);
3133       if (fatal)
3134         break;
3135
3136       if (loop_vinfo)
3137         {
3138           if (pick_lowest_cost_p)
3139             {
3140               /* Keep trying to roll back vectorization attempts while the
3141                  loop_vec_infos they produced were worse than this one.  */
3142               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3143               while (!vinfos.is_empty ()
3144                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3145                 {
3146                   gcc_assert (vect_epilogues);
3147                   delete vinfos.pop ();
3148                 }
3149             }
3150           /* For now only allow one epilogue loop.  */
3151           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3152             {
3153               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3154               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3155               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3156                           || maybe_ne (lowest_th, 0U));
3157               /* Keep track of the known smallest versioning
3158                  threshold.  */
3159               if (ordered_p (lowest_th, th))
3160                 lowest_th = ordered_min (lowest_th, th);
3161             }
3162           else
3163             {
3164               delete loop_vinfo;
3165               loop_vinfo = opt_loop_vec_info::success (NULL);
3166             }
3167
3168           /* For now only allow one epilogue loop, but allow
3169              pick_lowest_cost_p to replace it, so commit to the
3170              first epilogue if we have no reason to try alternatives.  */
3171           if (!pick_lowest_cost_p)
3172             break;
3173         }
3174
3175       if (mode_i == vector_modes.length ())
3176         break;
3177
3178       /* Try the next biggest vector size.  */
3179       if (dump_enabled_p ())
3180         dump_printf_loc (MSG_NOTE, vect_location,
3181                          "***** Re-trying epilogue analysis with vector "
3182                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3183     }
3184
3185   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3186     {
3187       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3188       if (dump_enabled_p ())
3189         dump_printf_loc (MSG_NOTE, vect_location,
3190                          "***** Choosing epilogue vector mode %s\n",
3191                          GET_MODE_NAME
3192                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3193     }
3194
3195   return first_loop_vinfo;
3196 }
3197
3198 /* Return true if there is an in-order reduction function for CODE, storing
3199    it in *REDUC_FN if so.  */
3200
3201 static bool
3202 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3203 {
3204   switch (code)
3205     {
3206     case PLUS_EXPR:
3207       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3208       return true;
3209
3210     default:
3211       return false;
3212     }
3213 }
3214
3215 /* Function reduction_fn_for_scalar_code
3216
3217    Input:
3218    CODE - tree_code of a reduction operations.
3219
3220    Output:
3221    REDUC_FN - the corresponding internal function to be used to reduce the
3222       vector of partial results into a single scalar result, or IFN_LAST
3223       if the operation is a supported reduction operation, but does not have
3224       such an internal function.
3225
3226    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3227
3228 bool
3229 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3230 {
3231   switch (code)
3232     {
3233       case MAX_EXPR:
3234         *reduc_fn = IFN_REDUC_MAX;
3235         return true;
3236
3237       case MIN_EXPR:
3238         *reduc_fn = IFN_REDUC_MIN;
3239         return true;
3240
3241       case PLUS_EXPR:
3242         *reduc_fn = IFN_REDUC_PLUS;
3243         return true;
3244
3245       case BIT_AND_EXPR:
3246         *reduc_fn = IFN_REDUC_AND;
3247         return true;
3248
3249       case BIT_IOR_EXPR:
3250         *reduc_fn = IFN_REDUC_IOR;
3251         return true;
3252
3253       case BIT_XOR_EXPR:
3254         *reduc_fn = IFN_REDUC_XOR;
3255         return true;
3256
3257       case MULT_EXPR:
3258       case MINUS_EXPR:
3259         *reduc_fn = IFN_LAST;
3260         return true;
3261
3262       default:
3263        return false;
3264     }
3265 }
3266
3267 /* If there is a neutral value X such that a reduction would not be affected
3268    by the introduction of additional X elements, return that X, otherwise
3269    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3270    of the scalar elements.  If the reduction has just a single initial value
3271    then INITIAL_VALUE is that value, otherwise it is null.  */
3272
3273 static tree
3274 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3275 {
3276   switch (code)
3277     {
3278     case WIDEN_SUM_EXPR:
3279     case DOT_PROD_EXPR:
3280     case SAD_EXPR:
3281     case PLUS_EXPR:
3282     case MINUS_EXPR:
3283     case BIT_IOR_EXPR:
3284     case BIT_XOR_EXPR:
3285       return build_zero_cst (scalar_type);
3286
3287     case MULT_EXPR:
3288       return build_one_cst (scalar_type);
3289
3290     case BIT_AND_EXPR:
3291       return build_all_ones_cst (scalar_type);
3292
3293     case MAX_EXPR:
3294     case MIN_EXPR:
3295       return initial_value;
3296
3297     default:
3298       return NULL_TREE;
3299     }
3300 }
3301
3302 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3303    STMT is printed with a message MSG. */
3304
3305 static void
3306 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3307 {
3308   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3309 }
3310
3311 /* Return true if we need an in-order reduction for operation CODE
3312    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3313    overflow must wrap.  */
3314
3315 bool
3316 needs_fold_left_reduction_p (tree type, tree_code code)
3317 {
3318   /* CHECKME: check for !flag_finite_math_only too?  */
3319   if (SCALAR_FLOAT_TYPE_P (type))
3320     switch (code)
3321       {
3322       case MIN_EXPR:
3323       case MAX_EXPR:
3324         return false;
3325
3326       default:
3327         return !flag_associative_math;
3328       }
3329
3330   if (INTEGRAL_TYPE_P (type))
3331     {
3332       if (!operation_no_trapping_overflow (type, code))
3333         return true;
3334       return false;
3335     }
3336
3337   if (SAT_FIXED_POINT_TYPE_P (type))
3338     return true;
3339
3340   return false;
3341 }
3342
3343 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3344    has a handled computation expression.  Store the main reduction
3345    operation in *CODE.  */
3346
3347 static bool
3348 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3349                       tree loop_arg, enum tree_code *code,
3350                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3351 {
3352   auto_bitmap visited;
3353   tree lookfor = PHI_RESULT (phi);
3354   ssa_op_iter curri;
3355   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3356   while (USE_FROM_PTR (curr) != loop_arg)
3357     curr = op_iter_next_use (&curri);
3358   curri.i = curri.numops;
3359   do
3360     {
3361       path.safe_push (std::make_pair (curri, curr));
3362       tree use = USE_FROM_PTR (curr);
3363       if (use == lookfor)
3364         break;
3365       gimple *def = SSA_NAME_DEF_STMT (use);
3366       if (gimple_nop_p (def)
3367           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3368         {
3369 pop:
3370           do
3371             {
3372               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3373               curri = x.first;
3374               curr = x.second;
3375               do
3376                 curr = op_iter_next_use (&curri);
3377               /* Skip already visited or non-SSA operands (from iterating
3378                  over PHI args).  */
3379               while (curr != NULL_USE_OPERAND_P
3380                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3381                          || ! bitmap_set_bit (visited,
3382                                               SSA_NAME_VERSION
3383                                                 (USE_FROM_PTR (curr)))));
3384             }
3385           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3386           if (curr == NULL_USE_OPERAND_P)
3387             break;
3388         }
3389       else
3390         {
3391           if (gimple_code (def) == GIMPLE_PHI)
3392             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3393           else
3394             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3395           while (curr != NULL_USE_OPERAND_P
3396                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3397                      || ! bitmap_set_bit (visited,
3398                                           SSA_NAME_VERSION
3399                                             (USE_FROM_PTR (curr)))))
3400             curr = op_iter_next_use (&curri);
3401           if (curr == NULL_USE_OPERAND_P)
3402             goto pop;
3403         }
3404     }
3405   while (1);
3406   if (dump_file && (dump_flags & TDF_DETAILS))
3407     {
3408       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3409       unsigned i;
3410       std::pair<ssa_op_iter, use_operand_p> *x;
3411       FOR_EACH_VEC_ELT (path, i, x)
3412         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3413       dump_printf (MSG_NOTE, "\n");
3414     }
3415
3416   /* Check whether the reduction path detected is valid.  */
3417   bool fail = path.length () == 0;
3418   bool neg = false;
3419   int sign = -1;
3420   *code = ERROR_MARK;
3421   for (unsigned i = 1; i < path.length (); ++i)
3422     {
3423       gimple *use_stmt = USE_STMT (path[i].second);
3424       tree op = USE_FROM_PTR (path[i].second);
3425       if (! is_gimple_assign (use_stmt)
3426           /* The following make sure we can compute the operand index
3427              easily plus it mostly disallows chaining via COND_EXPR condition
3428              operands.  */
3429           || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3430               && (gimple_num_ops (use_stmt) <= 2
3431                   || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3432               && (gimple_num_ops (use_stmt) <= 3
3433                   || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3434         {
3435           fail = true;
3436           break;
3437         }
3438       tree_code use_code = gimple_assign_rhs_code (use_stmt);
3439       if (use_code == MINUS_EXPR)
3440         {
3441           use_code = PLUS_EXPR;
3442           /* Track whether we negate the reduction value each iteration.  */
3443           if (gimple_assign_rhs2 (use_stmt) == op)
3444             neg = ! neg;
3445         }
3446       if (CONVERT_EXPR_CODE_P (use_code)
3447           && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3448                                     TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3449         ;
3450       else if (*code == ERROR_MARK)
3451         {
3452           *code = use_code;
3453           sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3454         }
3455       else if (use_code != *code)
3456         {
3457           fail = true;
3458           break;
3459         }
3460       else if ((use_code == MIN_EXPR
3461                 || use_code == MAX_EXPR)
3462                && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3463         {
3464           fail = true;
3465           break;
3466         }
3467       /* Check there's only a single stmt the op is used on.  For the
3468          not value-changing tail and the last stmt allow out-of-loop uses.
3469          ???  We could relax this and handle arbitrary live stmts by
3470          forcing a scalar epilogue for example.  */
3471       imm_use_iterator imm_iter;
3472       gimple *op_use_stmt;
3473       unsigned cnt = 0;
3474       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3475         if (!is_gimple_debug (op_use_stmt)
3476             && (*code != ERROR_MARK
3477                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3478           {
3479             /* We want to allow x + x but not x < 1 ? x : 2.  */
3480             if (is_gimple_assign (op_use_stmt)
3481                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3482               {
3483                 use_operand_p use_p;
3484                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3485                   cnt++;
3486               }
3487             else
3488               cnt++;
3489           }
3490       if (cnt != 1)
3491         {
3492           fail = true;
3493           break;
3494         }
3495     }
3496   return ! fail && ! neg && *code != ERROR_MARK;
3497 }
3498
3499 bool
3500 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3501                       tree loop_arg, enum tree_code code)
3502 {
3503   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3504   enum tree_code code_;
3505   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3506           && code_ == code);
3507 }
3508
3509
3510
3511 /* Function vect_is_simple_reduction
3512
3513    (1) Detect a cross-iteration def-use cycle that represents a simple
3514    reduction computation.  We look for the following pattern:
3515
3516    loop_header:
3517      a1 = phi < a0, a2 >
3518      a3 = ...
3519      a2 = operation (a3, a1)
3520
3521    or
3522
3523    a3 = ...
3524    loop_header:
3525      a1 = phi < a0, a2 >
3526      a2 = operation (a3, a1)
3527
3528    such that:
3529    1. operation is commutative and associative and it is safe to
3530       change the order of the computation
3531    2. no uses for a2 in the loop (a2 is used out of the loop)
3532    3. no uses of a1 in the loop besides the reduction operation
3533    4. no uses of a1 outside the loop.
3534
3535    Conditions 1,4 are tested here.
3536    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3537
3538    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3539    nested cycles.
3540
3541    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3542    reductions:
3543
3544      a1 = phi < a0, a2 >
3545      inner loop (def of a3)
3546      a2 = phi < a3 >
3547
3548    (4) Detect condition expressions, ie:
3549      for (int i = 0; i < N; i++)
3550        if (a[i] < val)
3551         ret_val = a[i];
3552
3553 */
3554
3555 static stmt_vec_info
3556 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3557                           bool *double_reduc, bool *reduc_chain_p)
3558 {
3559   gphi *phi = as_a <gphi *> (phi_info->stmt);
3560   gimple *phi_use_stmt = NULL;
3561   imm_use_iterator imm_iter;
3562   use_operand_p use_p;
3563
3564   *double_reduc = false;
3565   *reduc_chain_p = false;
3566   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3567
3568   tree phi_name = PHI_RESULT (phi);
3569   /* ???  If there are no uses of the PHI result the inner loop reduction
3570      won't be detected as possibly double-reduction by vectorizable_reduction
3571      because that tries to walk the PHI arg from the preheader edge which
3572      can be constant.  See PR60382.  */
3573   if (has_zero_uses (phi_name))
3574     return NULL;
3575   class loop *loop = (gimple_bb (phi))->loop_father;
3576   unsigned nphi_def_loop_uses = 0;
3577   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3578     {
3579       gimple *use_stmt = USE_STMT (use_p);
3580       if (is_gimple_debug (use_stmt))
3581         continue;
3582
3583       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3584         {
3585           if (dump_enabled_p ())
3586             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587                              "intermediate value used outside loop.\n");
3588
3589           return NULL;
3590         }
3591
3592       nphi_def_loop_uses++;
3593       phi_use_stmt = use_stmt;
3594     }
3595
3596   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3597   if (TREE_CODE (latch_def) != SSA_NAME)
3598     {
3599       if (dump_enabled_p ())
3600         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3601                          "reduction: not ssa_name: %T\n", latch_def);
3602       return NULL;
3603     }
3604
3605   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3606   if (!def_stmt_info
3607       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3608     return NULL;
3609
3610   bool nested_in_vect_loop
3611     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3612   unsigned nlatch_def_loop_uses = 0;
3613   auto_vec<gphi *, 3> lcphis;
3614   bool inner_loop_of_double_reduc = false;
3615   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3616     {
3617       gimple *use_stmt = USE_STMT (use_p);
3618       if (is_gimple_debug (use_stmt))
3619         continue;
3620       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3621         nlatch_def_loop_uses++;
3622       else
3623         {
3624           /* We can have more than one loop-closed PHI.  */
3625           lcphis.safe_push (as_a <gphi *> (use_stmt));
3626           if (nested_in_vect_loop
3627               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3628                   == vect_double_reduction_def))
3629             inner_loop_of_double_reduc = true;
3630         }
3631     }
3632
3633   /* If we are vectorizing an inner reduction we are executing that
3634      in the original order only in case we are not dealing with a
3635      double reduction.  */
3636   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3637     {
3638       if (dump_enabled_p ())
3639         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3640                         "detected nested cycle: ");
3641       return def_stmt_info;
3642     }
3643
3644   /* If this isn't a nested cycle or if the nested cycle reduction value
3645      is used ouside of the inner loop we cannot handle uses of the reduction
3646      value.  */
3647   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3648     {
3649       if (dump_enabled_p ())
3650         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3651                          "reduction used in loop.\n");
3652       return NULL;
3653     }
3654
3655   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3656      defined in the inner loop.  */
3657   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3658     {
3659       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3660       if (gimple_phi_num_args (def_stmt) != 1
3661           || TREE_CODE (op1) != SSA_NAME)
3662         {
3663           if (dump_enabled_p ())
3664             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3665                              "unsupported phi node definition.\n");
3666
3667           return NULL;
3668         }
3669
3670       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3671       if (gimple_bb (def1)
3672           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3673           && loop->inner
3674           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3675           && is_gimple_assign (def1)
3676           && is_a <gphi *> (phi_use_stmt)
3677           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3678         {
3679           if (dump_enabled_p ())
3680             report_vect_op (MSG_NOTE, def_stmt,
3681                             "detected double reduction: ");
3682
3683           *double_reduc = true;
3684           return def_stmt_info;
3685         }
3686
3687       return NULL;
3688     }
3689
3690   /* Look for the expression computing latch_def from then loop PHI result.  */
3691   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3692   enum tree_code code;
3693   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3694                             path))
3695     {
3696       STMT_VINFO_REDUC_CODE (phi_info) = code;
3697       if (code == COND_EXPR && !nested_in_vect_loop)
3698         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3699
3700       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3701          reduction chain for which the additional restriction is that
3702          all operations in the chain are the same.  */
3703       auto_vec<stmt_vec_info, 8> reduc_chain;
3704       unsigned i;
3705       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3706       for (i = path.length () - 1; i >= 1; --i)
3707         {
3708           gimple *stmt = USE_STMT (path[i].second);
3709           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3710           STMT_VINFO_REDUC_IDX (stmt_info)
3711             = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3712           enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3713           bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3714                                      && (i == 1 || i == path.length () - 1));
3715           if ((stmt_code != code && !leading_conversion)
3716               /* We can only handle the final value in epilogue
3717                  generation for reduction chains.  */
3718               || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3719             is_slp_reduc = false;
3720           /* For reduction chains we support a trailing/leading
3721              conversions.  We do not store those in the actual chain.  */
3722           if (leading_conversion)
3723             continue;
3724           reduc_chain.safe_push (stmt_info);
3725         }
3726       if (is_slp_reduc && reduc_chain.length () > 1)
3727         {
3728           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3729             {
3730               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3731               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3732             }
3733           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3734           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3735
3736           /* Save the chain for further analysis in SLP detection.  */
3737           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3738           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3739
3740           *reduc_chain_p = true;
3741           if (dump_enabled_p ())
3742             dump_printf_loc (MSG_NOTE, vect_location,
3743                             "reduction: detected reduction chain\n");
3744         }
3745       else if (dump_enabled_p ())
3746         dump_printf_loc (MSG_NOTE, vect_location,
3747                          "reduction: detected reduction\n");
3748
3749       return def_stmt_info;
3750     }
3751
3752   if (dump_enabled_p ())
3753     dump_printf_loc (MSG_NOTE, vect_location,
3754                      "reduction: unknown pattern\n");
3755
3756   return NULL;
3757 }
3758
3759 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3760    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3761    or -1 if not known.  */
3762
3763 static int
3764 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3765 {
3766   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3767   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3768     {
3769       if (dump_enabled_p ())
3770         dump_printf_loc (MSG_NOTE, vect_location,
3771                          "cost model: epilogue peel iters set to vf/2 "
3772                          "because loop iterations are unknown .\n");
3773       return assumed_vf / 2;
3774     }
3775   else
3776     {
3777       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3778       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3779       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3780       /* If we need to peel for gaps, but no peeling is required, we have to
3781          peel VF iterations.  */
3782       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3783         peel_iters_epilogue = assumed_vf;
3784       return peel_iters_epilogue;
3785     }
3786 }
3787
3788 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3789 int
3790 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3791                              int *peel_iters_epilogue,
3792                              stmt_vector_for_cost *scalar_cost_vec,
3793                              stmt_vector_for_cost *prologue_cost_vec,
3794                              stmt_vector_for_cost *epilogue_cost_vec)
3795 {
3796   int retval = 0;
3797
3798   *peel_iters_epilogue
3799     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3800
3801   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3802     {
3803       /* If peeled iterations are known but number of scalar loop
3804          iterations are unknown, count a taken branch per peeled loop.  */
3805       if (peel_iters_prologue > 0)
3806         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3807                                    NULL, NULL_TREE, 0, vect_prologue);
3808       if (*peel_iters_epilogue > 0)
3809         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3810                                     NULL, NULL_TREE, 0, vect_epilogue);
3811     }
3812
3813   stmt_info_for_cost *si;
3814   int j;
3815   if (peel_iters_prologue)
3816     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3817       retval += record_stmt_cost (prologue_cost_vec,
3818                                   si->count * peel_iters_prologue,
3819                                   si->kind, si->stmt_info, si->misalign,
3820                                   vect_prologue);
3821   if (*peel_iters_epilogue)
3822     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3823       retval += record_stmt_cost (epilogue_cost_vec,
3824                                   si->count * *peel_iters_epilogue,
3825                                   si->kind, si->stmt_info, si->misalign,
3826                                   vect_epilogue);
3827
3828   return retval;
3829 }
3830
3831 /* Function vect_estimate_min_profitable_iters
3832
3833    Return the number of iterations required for the vector version of the
3834    loop to be profitable relative to the cost of the scalar version of the
3835    loop.
3836
3837    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3838    of iterations for vectorization.  -1 value means loop vectorization
3839    is not profitable.  This returned value may be used for dynamic
3840    profitability check.
3841
3842    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3843    for static check against estimated number of iterations.  */
3844
3845 static void
3846 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3847                                     int *ret_min_profitable_niters,
3848                                     int *ret_min_profitable_estimate)
3849 {
3850   int min_profitable_iters;
3851   int min_profitable_estimate;
3852   int peel_iters_prologue;
3853   int peel_iters_epilogue;
3854   unsigned vec_inside_cost = 0;
3855   int vec_outside_cost = 0;
3856   unsigned vec_prologue_cost = 0;
3857   unsigned vec_epilogue_cost = 0;
3858   int scalar_single_iter_cost = 0;
3859   int scalar_outside_cost = 0;
3860   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3861   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3862   vector_costs *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3863
3864   /* Cost model disabled.  */
3865   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3866     {
3867       if (dump_enabled_p ())
3868         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3869       *ret_min_profitable_niters = 0;
3870       *ret_min_profitable_estimate = 0;
3871       return;
3872     }
3873
3874   /* Requires loop versioning tests to handle misalignment.  */
3875   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3876     {
3877       /*  FIXME: Make cost depend on complexity of individual check.  */
3878       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3879       (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3880                             NULL, NULL_TREE, 0, vect_prologue);
3881       if (dump_enabled_p ())
3882         dump_printf (MSG_NOTE,
3883                      "cost model: Adding cost of checks for loop "
3884                      "versioning to treat misalignment.\n");
3885     }
3886
3887   /* Requires loop versioning with alias checks.  */
3888   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3889     {
3890       /*  FIXME: Make cost depend on complexity of individual check.  */
3891       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3892       (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3893                             NULL, NULL_TREE, 0, vect_prologue);
3894       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3895       if (len)
3896         /* Count LEN - 1 ANDs and LEN comparisons.  */
3897         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3898                               scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3899       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3900       if (len)
3901         {
3902           /* Count LEN - 1 ANDs and LEN comparisons.  */
3903           unsigned int nstmts = len * 2 - 1;
3904           /* +1 for each bias that needs adding.  */
3905           for (unsigned int i = 0; i < len; ++i)
3906             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3907               nstmts += 1;
3908           (void) add_stmt_cost (target_cost_data, nstmts,
3909                                 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3910         }
3911       if (dump_enabled_p ())
3912         dump_printf (MSG_NOTE,
3913                      "cost model: Adding cost of checks for loop "
3914                      "versioning aliasing.\n");
3915     }
3916
3917   /* Requires loop versioning with niter checks.  */
3918   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3919     {
3920       /*  FIXME: Make cost depend on complexity of individual check.  */
3921       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3922                             NULL, NULL_TREE, 0, vect_prologue);
3923       if (dump_enabled_p ())
3924         dump_printf (MSG_NOTE,
3925                      "cost model: Adding cost of checks for loop "
3926                      "versioning niters.\n");
3927     }
3928
3929   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3930     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3931                           NULL, NULL_TREE, 0, vect_prologue);
3932
3933   /* Count statements in scalar loop.  Using this as scalar cost for a single
3934      iteration for now.
3935
3936      TODO: Add outer loop support.
3937
3938      TODO: Consider assigning different costs to different scalar
3939      statements.  */
3940
3941   scalar_single_iter_cost
3942     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3943
3944   /* Add additional cost for the peeled instructions in prologue and epilogue
3945      loop.  (For fully-masked loops there will be no peeling.)
3946
3947      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3948      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3949
3950      TODO: Build an expression that represents peel_iters for prologue and
3951      epilogue to be used in a run-time test.  */
3952
3953   bool prologue_need_br_taken_cost = false;
3954   bool prologue_need_br_not_taken_cost = false;
3955
3956   /* Calculate peel_iters_prologue.  */
3957   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3958     peel_iters_prologue = 0;
3959   else if (npeel < 0)
3960     {
3961       peel_iters_prologue = assumed_vf / 2;
3962       if (dump_enabled_p ())
3963         dump_printf (MSG_NOTE, "cost model: "
3964                      "prologue peel iters set to vf/2.\n");
3965
3966       /* If peeled iterations are unknown, count a taken branch and a not taken
3967          branch per peeled loop.  Even if scalar loop iterations are known,
3968          vector iterations are not known since peeled prologue iterations are
3969          not known.  Hence guards remain the same.  */
3970       prologue_need_br_taken_cost = true;
3971       prologue_need_br_not_taken_cost = true;
3972     }
3973   else
3974     {
3975       peel_iters_prologue = npeel;
3976       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3977         /* If peeled iterations are known but number of scalar loop
3978            iterations are unknown, count a taken branch per peeled loop.  */
3979         prologue_need_br_taken_cost = true;
3980     }
3981
3982   bool epilogue_need_br_taken_cost = false;
3983   bool epilogue_need_br_not_taken_cost = false;
3984
3985   /* Calculate peel_iters_epilogue.  */
3986   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3987     /* We need to peel exactly one iteration for gaps.  */
3988     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3989   else if (npeel < 0)
3990     {
3991       /* If peeling for alignment is unknown, loop bound of main loop
3992          becomes unknown.  */
3993       peel_iters_epilogue = assumed_vf / 2;
3994       if (dump_enabled_p ())
3995         dump_printf (MSG_NOTE, "cost model: "
3996                      "epilogue peel iters set to vf/2 because "
3997                      "peeling for alignment is unknown.\n");
3998
3999       /* See the same reason above in peel_iters_prologue calculation.  */
4000       epilogue_need_br_taken_cost = true;
4001       epilogue_need_br_not_taken_cost = true;
4002     }
4003   else
4004     {
4005       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4006       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4007         /* If peeled iterations are known but number of scalar loop
4008            iterations are unknown, count a taken branch per peeled loop.  */
4009         epilogue_need_br_taken_cost = true;
4010     }
4011
4012   stmt_info_for_cost *si;
4013   int j;
4014   /* Add costs associated with peel_iters_prologue.  */
4015   if (peel_iters_prologue)
4016     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4017       {
4018         (void) add_stmt_cost (target_cost_data,
4019                               si->count * peel_iters_prologue, si->kind,
4020                               si->stmt_info, si->vectype, si->misalign,
4021                               vect_prologue);
4022       }
4023
4024   /* Add costs associated with peel_iters_epilogue.  */
4025   if (peel_iters_epilogue)
4026     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4027       {
4028         (void) add_stmt_cost (target_cost_data,
4029                               si->count * peel_iters_epilogue, si->kind,
4030                               si->stmt_info, si->vectype, si->misalign,
4031                               vect_epilogue);
4032       }
4033
4034   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4035
4036   if (prologue_need_br_taken_cost)
4037     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4038                           NULL, NULL_TREE, 0, vect_prologue);
4039
4040   if (prologue_need_br_not_taken_cost)
4041     (void) add_stmt_cost (target_cost_data, 1,
4042                           cond_branch_not_taken, NULL, NULL_TREE, 0,
4043                           vect_prologue);
4044
4045   if (epilogue_need_br_taken_cost)
4046     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4047                           NULL, NULL_TREE, 0, vect_epilogue);
4048
4049   if (epilogue_need_br_not_taken_cost)
4050     (void) add_stmt_cost (target_cost_data, 1,
4051                           cond_branch_not_taken, NULL, NULL_TREE, 0,
4052                           vect_epilogue);
4053
4054   /* Take care of special costs for rgroup controls of partial vectors.  */
4055   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4056     {
4057       /* Calculate how many masks we need to generate.  */
4058       unsigned int num_masks = 0;
4059       rgroup_controls *rgm;
4060       unsigned int num_vectors_m1;
4061       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4062         if (rgm->type)
4063           num_masks += num_vectors_m1 + 1;
4064       gcc_assert (num_masks > 0);
4065
4066       /* In the worst case, we need to generate each mask in the prologue
4067          and in the loop body.  One of the loop body mask instructions
4068          replaces the comparison in the scalar loop, and since we don't
4069          count the scalar comparison against the scalar body, we shouldn't
4070          count that vector instruction against the vector body either.
4071
4072          Sometimes we can use unpacks instead of generating prologue
4073          masks and sometimes the prologue mask will fold to a constant,
4074          so the actual prologue cost might be smaller.  However, it's
4075          simpler and safer to use the worst-case cost; if this ends up
4076          being the tie-breaker between vectorizing or not, then it's
4077          probably better not to vectorize.  */
4078       (void) add_stmt_cost (target_cost_data, num_masks,
4079                             vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4080       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4081                             vector_stmt, NULL, NULL_TREE, 0, vect_body);
4082     }
4083   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4084     {
4085       /* Referring to the functions vect_set_loop_condition_partial_vectors
4086          and vect_set_loop_controls_directly, we need to generate each
4087          length in the prologue and in the loop body if required. Although
4088          there are some possible optimizations, we consider the worst case
4089          here.  */
4090
4091       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4092       bool need_iterate_p
4093         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4094            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4095
4096       /* Calculate how many statements to be added.  */
4097       unsigned int prologue_stmts = 0;
4098       unsigned int body_stmts = 0;
4099
4100       rgroup_controls *rgc;
4101       unsigned int num_vectors_m1;
4102       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4103         if (rgc->type)
4104           {
4105             /* May need one SHIFT for nitems_total computation.  */
4106             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4107             if (nitems != 1 && !niters_known_p)
4108               prologue_stmts += 1;
4109
4110             /* May need one MAX and one MINUS for wrap around.  */
4111             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4112               prologue_stmts += 2;
4113
4114             /* Need one MAX and one MINUS for each batch limit excepting for
4115                the 1st one.  */
4116             prologue_stmts += num_vectors_m1 * 2;
4117
4118             unsigned int num_vectors = num_vectors_m1 + 1;
4119
4120             /* Need to set up lengths in prologue, only one MIN required
4121                for each since start index is zero.  */
4122             prologue_stmts += num_vectors;
4123
4124             /* Each may need two MINs and one MINUS to update lengths in body
4125                for next iteration.  */
4126             if (need_iterate_p)
4127               body_stmts += 3 * num_vectors;
4128           }
4129
4130       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4131                             scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4132       (void) add_stmt_cost (target_cost_data, body_stmts,
4133                             scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4134     }
4135
4136   /* FORNOW: The scalar outside cost is incremented in one of the
4137      following ways:
4138
4139      1. The vectorizer checks for alignment and aliasing and generates
4140      a condition that allows dynamic vectorization.  A cost model
4141      check is ANDED with the versioning condition.  Hence scalar code
4142      path now has the added cost of the versioning check.
4143
4144        if (cost > th & versioning_check)
4145          jmp to vector code
4146
4147      Hence run-time scalar is incremented by not-taken branch cost.
4148
4149      2. The vectorizer then checks if a prologue is required.  If the
4150      cost model check was not done before during versioning, it has to
4151      be done before the prologue check.
4152
4153        if (cost <= th)
4154          prologue = scalar_iters
4155        if (prologue == 0)
4156          jmp to vector code
4157        else
4158          execute prologue
4159        if (prologue == num_iters)
4160          go to exit
4161
4162      Hence the run-time scalar cost is incremented by a taken branch,
4163      plus a not-taken branch, plus a taken branch cost.
4164
4165      3. The vectorizer then checks if an epilogue is required.  If the
4166      cost model check was not done before during prologue check, it
4167      has to be done with the epilogue check.
4168
4169        if (prologue == 0)
4170          jmp to vector code
4171        else
4172          execute prologue
4173        if (prologue == num_iters)
4174          go to exit
4175        vector code:
4176          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4177            jmp to epilogue
4178
4179      Hence the run-time scalar cost should be incremented by 2 taken
4180      branches.
4181
4182      TODO: The back end may reorder the BBS's differently and reverse
4183      conditions/branch directions.  Change the estimates below to
4184      something more reasonable.  */
4185
4186   /* If the number of iterations is known and we do not do versioning, we can
4187      decide whether to vectorize at compile time.  Hence the scalar version
4188      do not carry cost model guard costs.  */
4189   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4190       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4191     {
4192       /* Cost model check occurs at versioning.  */
4193       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4194         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4195       else
4196         {
4197           /* Cost model check occurs at prologue generation.  */
4198           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4199             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4200               + vect_get_stmt_cost (cond_branch_not_taken);
4201           /* Cost model check occurs at epilogue generation.  */
4202           else
4203             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4204         }
4205     }
4206
4207   /* Complete the target-specific cost calculations.  */
4208   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4209                &vec_inside_cost, &vec_epilogue_cost);
4210
4211   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4212
4213   /* Stash the costs so that we can compare two loop_vec_infos.  */
4214   loop_vinfo->vec_inside_cost = vec_inside_cost;
4215   loop_vinfo->vec_outside_cost = vec_outside_cost;
4216
4217   if (dump_enabled_p ())
4218     {
4219       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4220       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4221                    vec_inside_cost);
4222       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4223                    vec_prologue_cost);
4224       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4225                    vec_epilogue_cost);
4226       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4227                    scalar_single_iter_cost);
4228       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4229                    scalar_outside_cost);
4230       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4231                    vec_outside_cost);
4232       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4233                    peel_iters_prologue);
4234       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4235                    peel_iters_epilogue);
4236     }
4237
4238   /* Calculate number of iterations required to make the vector version
4239      profitable, relative to the loop bodies only.  The following condition
4240      must hold true:
4241      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4242      where
4243      SIC = scalar iteration cost, VIC = vector iteration cost,
4244      VOC = vector outside cost, VF = vectorization factor,
4245      NPEEL = prologue iterations + epilogue iterations,
4246      SOC = scalar outside cost for run time cost model check.  */
4247
4248   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4249                           - vec_inside_cost);
4250   if (saving_per_viter <= 0)
4251     {
4252       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4253         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4254                     "vectorization did not happen for a simd loop");
4255
4256       if (dump_enabled_p ())
4257         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4258                          "cost model: the vector iteration cost = %d "
4259                          "divided by the scalar iteration cost = %d "
4260                          "is greater or equal to the vectorization factor = %d"
4261                          ".\n",
4262                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4263       *ret_min_profitable_niters = -1;
4264       *ret_min_profitable_estimate = -1;
4265       return;
4266     }
4267
4268   /* ??? The "if" arm is written to handle all cases; see below for what
4269      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4270   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4271     {
4272       /* Rewriting the condition above in terms of the number of
4273          vector iterations (vniters) rather than the number of
4274          scalar iterations (niters) gives:
4275
4276          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4277
4278          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4279
4280          For integer N, X and Y when X > 0:
4281
4282          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4283       int outside_overhead = (vec_outside_cost
4284                               - scalar_single_iter_cost * peel_iters_prologue
4285                               - scalar_single_iter_cost * peel_iters_epilogue
4286                               - scalar_outside_cost);
4287       /* We're only interested in cases that require at least one
4288          vector iteration.  */
4289       int min_vec_niters = 1;
4290       if (outside_overhead > 0)
4291         min_vec_niters = outside_overhead / saving_per_viter + 1;
4292
4293       if (dump_enabled_p ())
4294         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4295                      min_vec_niters);
4296
4297       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4298         {
4299           /* Now that we know the minimum number of vector iterations,
4300              find the minimum niters for which the scalar cost is larger:
4301
4302              SIC * niters > VIC * vniters + VOC - SOC
4303
4304              We know that the minimum niters is no more than
4305              vniters * VF + NPEEL, but it might be (and often is) less
4306              than that if a partial vector iteration is cheaper than the
4307              equivalent scalar code.  */
4308           int threshold = (vec_inside_cost * min_vec_niters
4309                            + vec_outside_cost
4310                            - scalar_outside_cost);
4311           if (threshold <= 0)
4312             min_profitable_iters = 1;
4313           else
4314             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4315         }
4316       else
4317         /* Convert the number of vector iterations into a number of
4318            scalar iterations.  */
4319         min_profitable_iters = (min_vec_niters * assumed_vf
4320                                 + peel_iters_prologue
4321                                 + peel_iters_epilogue);
4322     }
4323   else
4324     {
4325       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4326                               * assumed_vf
4327                               - vec_inside_cost * peel_iters_prologue
4328                               - vec_inside_cost * peel_iters_epilogue);
4329       if (min_profitable_iters <= 0)
4330         min_profitable_iters = 0;
4331       else
4332         {
4333           min_profitable_iters /= saving_per_viter;
4334
4335           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4336               <= (((int) vec_inside_cost * min_profitable_iters)
4337                   + (((int) vec_outside_cost - scalar_outside_cost)
4338                      * assumed_vf)))
4339             min_profitable_iters++;
4340         }
4341     }
4342
4343   if (dump_enabled_p ())
4344     dump_printf (MSG_NOTE,
4345                  "  Calculated minimum iters for profitability: %d\n",
4346                  min_profitable_iters);
4347
4348   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4349       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4350     /* We want the vectorized loop to execute at least once.  */
4351     min_profitable_iters = assumed_vf + peel_iters_prologue;
4352   else if (min_profitable_iters < peel_iters_prologue)
4353     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4354        vectorized loop executes at least once.  */
4355     min_profitable_iters = peel_iters_prologue;
4356
4357   if (dump_enabled_p ())
4358     dump_printf_loc (MSG_NOTE, vect_location,
4359                      "  Runtime profitability threshold = %d\n",
4360                      min_profitable_iters);
4361
4362   *ret_min_profitable_niters = min_profitable_iters;
4363
4364   /* Calculate number of iterations required to make the vector version
4365      profitable, relative to the loop bodies only.
4366
4367      Non-vectorized variant is SIC * niters and it must win over vector
4368      variant on the expected loop trip count.  The following condition must hold true:
4369      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4370
4371   if (vec_outside_cost <= 0)
4372     min_profitable_estimate = 0;
4373   /* ??? This "else if" arm is written to handle all cases; see below for
4374      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4375   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4376     {
4377       /* This is a repeat of the code above, but with + SOC rather
4378          than - SOC.  */
4379       int outside_overhead = (vec_outside_cost
4380                               - scalar_single_iter_cost * peel_iters_prologue
4381                               - scalar_single_iter_cost * peel_iters_epilogue
4382                               + scalar_outside_cost);
4383       int min_vec_niters = 1;
4384       if (outside_overhead > 0)
4385         min_vec_niters = outside_overhead / saving_per_viter + 1;
4386
4387       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4388         {
4389           int threshold = (vec_inside_cost * min_vec_niters
4390                            + vec_outside_cost
4391                            + scalar_outside_cost);
4392           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4393         }
4394       else
4395         min_profitable_estimate = (min_vec_niters * assumed_vf
4396                                    + peel_iters_prologue
4397                                    + peel_iters_epilogue);
4398     }
4399   else
4400     {
4401       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4402                                  * assumed_vf
4403                                  - vec_inside_cost * peel_iters_prologue
4404                                  - vec_inside_cost * peel_iters_epilogue)
4405                                  / ((scalar_single_iter_cost * assumed_vf)
4406                                    - vec_inside_cost);
4407     }
4408   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4409   if (dump_enabled_p ())
4410     dump_printf_loc (MSG_NOTE, vect_location,
4411                      "  Static estimate profitability threshold = %d\n",
4412                      min_profitable_estimate);
4413
4414   *ret_min_profitable_estimate = min_profitable_estimate;
4415 }
4416
4417 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4418    vector elements (not bits) for a vector with NELT elements.  */
4419 static void
4420 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4421                               vec_perm_builder *sel)
4422 {
4423   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4424      by vec_perm_indices.  */
4425   sel->new_vector (nelt, 1, 3);
4426   for (unsigned int i = 0; i < 3; i++)
4427     sel->quick_push (i + offset);
4428 }
4429
4430 /* Checks whether the target supports whole-vector shifts for vectors of mode
4431    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4432    it supports vec_perm_const with masks for all necessary shift amounts.  */
4433 static bool
4434 have_whole_vector_shift (machine_mode mode)
4435 {
4436   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4437     return true;
4438
4439   /* Variable-length vectors should be handled via the optab.  */
4440   unsigned int nelt;
4441   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4442     return false;
4443
4444   vec_perm_builder sel;
4445   vec_perm_indices indices;
4446   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4447     {
4448       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4449       indices.new_vector (sel, 2, nelt);
4450       if (!can_vec_perm_const_p (mode, indices, false))
4451         return false;
4452     }
4453   return true;
4454 }
4455
4456 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4457    functions. Design better to avoid maintenance issues.  */
4458
4459 /* Function vect_model_reduction_cost.
4460
4461    Models cost for a reduction operation, including the vector ops
4462    generated within the strip-mine loop in some cases, the initial
4463    definition before the loop, and the epilogue code that must be generated.  */
4464
4465 static void
4466 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4467                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4468                            vect_reduction_type reduction_type,
4469                            int ncopies, stmt_vector_for_cost *cost_vec)
4470 {
4471   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4472   enum tree_code code;
4473   optab optab;
4474   tree vectype;
4475   machine_mode mode;
4476   class loop *loop = NULL;
4477
4478   if (loop_vinfo)
4479     loop = LOOP_VINFO_LOOP (loop_vinfo);
4480
4481   /* Condition reductions generate two reductions in the loop.  */
4482   if (reduction_type == COND_REDUCTION)
4483     ncopies *= 2;
4484
4485   vectype = STMT_VINFO_VECTYPE (stmt_info);
4486   mode = TYPE_MODE (vectype);
4487   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4488
4489   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4490
4491   if (reduction_type == EXTRACT_LAST_REDUCTION)
4492     /* No extra instructions are needed in the prologue.  The loop body
4493        operations are costed in vectorizable_condition.  */
4494     inside_cost = 0;
4495   else if (reduction_type == FOLD_LEFT_REDUCTION)
4496     {
4497       /* No extra instructions needed in the prologue.  */
4498       prologue_cost = 0;
4499
4500       if (reduc_fn != IFN_LAST)
4501         /* Count one reduction-like operation per vector.  */
4502         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4503                                         stmt_info, 0, vect_body);
4504       else
4505         {
4506           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4507           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4508           inside_cost = record_stmt_cost (cost_vec, nelements,
4509                                           vec_to_scalar, stmt_info, 0,
4510                                           vect_body);
4511           inside_cost += record_stmt_cost (cost_vec, nelements,
4512                                            scalar_stmt, stmt_info, 0,
4513                                            vect_body);
4514         }
4515     }
4516   else
4517     {
4518       /* Add in cost for initial definition.
4519          For cond reduction we have four vectors: initial index, step,
4520          initial result of the data reduction, initial value of the index
4521          reduction.  */
4522       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4523       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4524                                          scalar_to_vec, stmt_info, 0,
4525                                          vect_prologue);
4526     }
4527
4528   /* Determine cost of epilogue code.
4529
4530      We have a reduction operator that will reduce the vector in one statement.
4531      Also requires scalar extract.  */
4532
4533   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4534     {
4535       if (reduc_fn != IFN_LAST)
4536         {
4537           if (reduction_type == COND_REDUCTION)
4538             {
4539               /* An EQ stmt and an COND_EXPR stmt.  */
4540               epilogue_cost += record_stmt_cost (cost_vec, 2,
4541                                                  vector_stmt, stmt_info, 0,
4542                                                  vect_epilogue);
4543               /* Reduction of the max index and a reduction of the found
4544                  values.  */
4545               epilogue_cost += record_stmt_cost (cost_vec, 2,
4546                                                  vec_to_scalar, stmt_info, 0,
4547                                                  vect_epilogue);
4548               /* A broadcast of the max value.  */
4549               epilogue_cost += record_stmt_cost (cost_vec, 1,
4550                                                  scalar_to_vec, stmt_info, 0,
4551                                                  vect_epilogue);
4552             }
4553           else
4554             {
4555               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4556                                                  stmt_info, 0, vect_epilogue);
4557               epilogue_cost += record_stmt_cost (cost_vec, 1,
4558                                                  vec_to_scalar, stmt_info, 0,
4559                                                  vect_epilogue);
4560             }
4561         }
4562       else if (reduction_type == COND_REDUCTION)
4563         {
4564           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4565           /* Extraction of scalar elements.  */
4566           epilogue_cost += record_stmt_cost (cost_vec,
4567                                              2 * estimated_nunits,
4568                                              vec_to_scalar, stmt_info, 0,
4569                                              vect_epilogue);
4570           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4571           epilogue_cost += record_stmt_cost (cost_vec,
4572                                              2 * estimated_nunits - 3,
4573                                              scalar_stmt, stmt_info, 0,
4574                                              vect_epilogue);
4575         }
4576       else if (reduction_type == EXTRACT_LAST_REDUCTION
4577                || reduction_type == FOLD_LEFT_REDUCTION)
4578         /* No extra instructions need in the epilogue.  */
4579         ;
4580       else
4581         {
4582           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4583           tree bitsize =
4584             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4585           int element_bitsize = tree_to_uhwi (bitsize);
4586           int nelements = vec_size_in_bits / element_bitsize;
4587
4588           if (code == COND_EXPR)
4589             code = MAX_EXPR;
4590
4591           optab = optab_for_tree_code (code, vectype, optab_default);
4592
4593           /* We have a whole vector shift available.  */
4594           if (optab != unknown_optab
4595               && VECTOR_MODE_P (mode)
4596               && optab_handler (optab, mode) != CODE_FOR_nothing
4597               && have_whole_vector_shift (mode))
4598             {
4599               /* Final reduction via vector shifts and the reduction operator.
4600                  Also requires scalar extract.  */
4601               epilogue_cost += record_stmt_cost (cost_vec,
4602                                                  exact_log2 (nelements) * 2,
4603                                                  vector_stmt, stmt_info, 0,
4604                                                  vect_epilogue);
4605               epilogue_cost += record_stmt_cost (cost_vec, 1,
4606                                                  vec_to_scalar, stmt_info, 0,
4607                                                  vect_epilogue);
4608             }
4609           else
4610             /* Use extracts and reduction op for final reduction.  For N
4611                elements, we have N extracts and N-1 reduction ops.  */
4612             epilogue_cost += record_stmt_cost (cost_vec,
4613                                                nelements + nelements - 1,
4614                                                vector_stmt, stmt_info, 0,
4615                                                vect_epilogue);
4616         }
4617     }
4618
4619   if (dump_enabled_p ())
4620     dump_printf (MSG_NOTE,
4621                  "vect_model_reduction_cost: inside_cost = %d, "
4622                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4623                  prologue_cost, epilogue_cost);
4624 }
4625
4626 /* SEQ is a sequence of instructions that initialize the reduction
4627    described by REDUC_INFO.  Emit them in the appropriate place.  */
4628
4629 static void
4630 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4631                                 stmt_vec_info reduc_info, gimple *seq)
4632 {
4633   if (reduc_info->reused_accumulator)
4634     {
4635       /* When reusing an accumulator from the main loop, we only need
4636          initialization instructions if the main loop can be skipped.
4637          In that case, emit the initialization instructions at the end
4638          of the guard block that does the skip.  */
4639       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4640       gcc_assert (skip_edge);
4641       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4642       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4643     }
4644   else
4645     {
4646       /* The normal case: emit the initialization instructions on the
4647          preheader edge.  */
4648       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4649       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4650     }
4651 }
4652
4653 /* Function get_initial_def_for_reduction
4654
4655    Input:
4656    REDUC_INFO - the info_for_reduction
4657    INIT_VAL - the initial value of the reduction variable
4658    NEUTRAL_OP - a value that has no effect on the reduction, as per
4659                 neutral_op_for_reduction
4660
4661    Output:
4662    Return a vector variable, initialized according to the operation that
4663         STMT_VINFO performs. This vector will be used as the initial value
4664         of the vector of partial results.
4665
4666    The value we need is a vector in which element 0 has value INIT_VAL
4667    and every other element has value NEUTRAL_OP.  */
4668
4669 static tree
4670 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4671                                stmt_vec_info reduc_info,
4672                                tree init_val, tree neutral_op)
4673 {
4674   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4675   tree scalar_type = TREE_TYPE (init_val);
4676   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4677   tree init_def;
4678   gimple_seq stmts = NULL;
4679
4680   gcc_assert (vectype);
4681
4682   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4683               || SCALAR_FLOAT_TYPE_P (scalar_type));
4684
4685   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4686               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4687
4688   if (operand_equal_p (init_val, neutral_op))
4689     {
4690       /* If both elements are equal then the vector described above is
4691          just a splat.  */
4692       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4693       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4694     }
4695   else
4696     {
4697       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4698       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4699       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4700         {
4701           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4702              element 0.  */
4703           init_def = gimple_build_vector_from_val (&stmts, vectype,
4704                                                    neutral_op);
4705           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4706                                    vectype, init_def, init_val);
4707         }
4708       else
4709         {
4710           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4711           tree_vector_builder elts (vectype, 1, 2);
4712           elts.quick_push (init_val);
4713           elts.quick_push (neutral_op);
4714           init_def = gimple_build_vector (&stmts, &elts);
4715         }
4716     }
4717
4718   if (stmts)
4719     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4720   return init_def;
4721 }
4722
4723 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4724    which performs a reduction involving GROUP_SIZE scalar statements.
4725    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4726    is nonnull, introducing extra elements of that value will not change the
4727    result.  */
4728
4729 static void
4730 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4731                                 stmt_vec_info reduc_info,
4732                                 vec<tree> *vec_oprnds,
4733                                 unsigned int number_of_vectors,
4734                                 unsigned int group_size, tree neutral_op)
4735 {
4736   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4737   unsigned HOST_WIDE_INT nunits;
4738   unsigned j, number_of_places_left_in_vector;
4739   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4740   unsigned int i;
4741
4742   gcc_assert (group_size == initial_values.length () || neutral_op);
4743
4744   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4745      created vectors. It is greater than 1 if unrolling is performed.
4746
4747      For example, we have two scalar operands, s1 and s2 (e.g., group of
4748      strided accesses of size two), while NUNITS is four (i.e., four scalars
4749      of this type can be packed in a vector).  The output vector will contain
4750      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4751      will be 2).
4752
4753      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4754      vectors containing the operands.
4755
4756      For example, NUNITS is four as before, and the group size is 8
4757      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4758      {s5, s6, s7, s8}.  */
4759
4760   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4761     nunits = group_size;
4762
4763   number_of_places_left_in_vector = nunits;
4764   bool constant_p = true;
4765   tree_vector_builder elts (vector_type, nunits, 1);
4766   elts.quick_grow (nunits);
4767   gimple_seq ctor_seq = NULL;
4768   for (j = 0; j < nunits * number_of_vectors; ++j)
4769     {
4770       tree op;
4771       i = j % group_size;
4772
4773       /* Get the def before the loop.  In reduction chain we have only
4774          one initial value.  Else we have as many as PHIs in the group.  */
4775       if (i >= initial_values.length () || (j > i && neutral_op))
4776         op = neutral_op;
4777       else
4778         op = initial_values[i];
4779
4780       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4781       number_of_places_left_in_vector--;
4782       elts[nunits - number_of_places_left_in_vector - 1] = op;
4783       if (!CONSTANT_CLASS_P (op))
4784         constant_p = false;
4785
4786       if (number_of_places_left_in_vector == 0)
4787         {
4788           tree init;
4789           if (constant_p && !neutral_op
4790               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4791               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4792             /* Build the vector directly from ELTS.  */
4793             init = gimple_build_vector (&ctor_seq, &elts);
4794           else if (neutral_op)
4795             {
4796               /* Build a vector of the neutral value and shift the
4797                  other elements into place.  */
4798               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4799                                                    neutral_op);
4800               int k = nunits;
4801               while (k > 0 && elts[k - 1] == neutral_op)
4802                 k -= 1;
4803               while (k > 0)
4804                 {
4805                   k -= 1;
4806                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4807                                        vector_type, init, elts[k]);
4808                 }
4809             }
4810           else
4811             {
4812               /* First time round, duplicate ELTS to fill the
4813                  required number of vectors.  */
4814               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4815                                         elts, number_of_vectors, *vec_oprnds);
4816               break;
4817             }
4818           vec_oprnds->quick_push (init);
4819
4820           number_of_places_left_in_vector = nunits;
4821           elts.new_vector (vector_type, nunits, 1);
4822           elts.quick_grow (nunits);
4823           constant_p = true;
4824         }
4825     }
4826   if (ctor_seq != NULL)
4827     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4828 }
4829
4830 /* For a statement STMT_INFO taking part in a reduction operation return
4831    the stmt_vec_info the meta information is stored on.  */
4832
4833 stmt_vec_info
4834 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4835 {
4836   stmt_info = vect_orig_stmt (stmt_info);
4837   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4838   if (!is_a <gphi *> (stmt_info->stmt)
4839       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4840     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4841   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4842   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4843     {
4844       if (gimple_phi_num_args (phi) == 1)
4845         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4846     }
4847   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4848     {
4849       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4850       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4851         stmt_info = info;
4852     }
4853   return stmt_info;
4854 }
4855
4856 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4857    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
4858    return false.  */
4859
4860 static bool
4861 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4862                                 stmt_vec_info reduc_info)
4863 {
4864   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4865   if (!main_loop_vinfo)
4866     return false;
4867
4868   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4869     return false;
4870
4871   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4872   auto_vec<tree, 16> main_loop_results (num_phis);
4873   auto_vec<tree, 16> initial_values (num_phis);
4874   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4875     {
4876       /* The epilogue loop can be entered either from the main loop or
4877          from an earlier guard block.  */
4878       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4879       for (tree incoming_value : reduc_info->reduc_initial_values)
4880         {
4881           /* Look for:
4882
4883                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4884                                     INITIAL_VALUE(guard block)>.  */
4885           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4886
4887           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4888           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4889
4890           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4891           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4892
4893           main_loop_results.quick_push (from_main_loop);
4894           initial_values.quick_push (from_skip);
4895         }
4896     }
4897   else
4898     /* The main loop dominates the epilogue loop.  */
4899     main_loop_results.splice (reduc_info->reduc_initial_values);
4900
4901   /* See if the main loop has the kind of accumulator we need.  */
4902   vect_reusable_accumulator *accumulator
4903     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4904   if (!accumulator
4905       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4906       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4907                       accumulator->reduc_info->reduc_scalar_results.begin ()))
4908     return false;
4909
4910   /* Handle the case where we can reduce wider vectors to narrower ones.  */
4911   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4912   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4913   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4914                             TYPE_VECTOR_SUBPARTS (vectype)))
4915     return false;
4916
4917   /* Non-SLP reductions might apply an adjustment after the reduction
4918      operation, in order to simplify the initialization of the accumulator.
4919      If the epilogue loop carries on from where the main loop left off,
4920      it should apply the same adjustment to the final reduction result.
4921
4922      If the epilogue loop can also be entered directly (rather than via
4923      the main loop), we need to be able to handle that case in the same way,
4924      with the same adjustment.  (In principle we could add a PHI node
4925      to select the correct adjustment, but in practice that shouldn't be
4926      necessary.)  */
4927   tree main_adjustment
4928     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4929   if (loop_vinfo->main_loop_edge && main_adjustment)
4930     {
4931       gcc_assert (num_phis == 1);
4932       tree initial_value = initial_values[0];
4933       /* Check that we can use INITIAL_VALUE as the adjustment and
4934          initialize the accumulator with a neutral value instead.  */
4935       if (!operand_equal_p (initial_value, main_adjustment))
4936         return false;
4937       tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4938       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4939                                                     code, initial_value);
4940     }
4941   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4942   reduc_info->reduc_initial_values.truncate (0);
4943   reduc_info->reduc_initial_values.splice (initial_values);
4944   reduc_info->reused_accumulator = accumulator;
4945   return true;
4946 }
4947
4948 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4949    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
4950
4951 static tree
4952 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4953                             gimple_seq *seq)
4954 {
4955   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4956   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4957   tree stype = TREE_TYPE (vectype);
4958   tree new_temp = vec_def;
4959   while (nunits > nunits1)
4960     {
4961       nunits /= 2;
4962       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4963                                                            stype, nunits);
4964       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4965
4966       /* The target has to make sure we support lowpart/highpart
4967          extraction, either via direct vector extract or through
4968          an integer mode punning.  */
4969       tree dst1, dst2;
4970       gimple *epilog_stmt;
4971       if (convert_optab_handler (vec_extract_optab,
4972                                  TYPE_MODE (TREE_TYPE (new_temp)),
4973                                  TYPE_MODE (vectype1))
4974           != CODE_FOR_nothing)
4975         {
4976           /* Extract sub-vectors directly once vec_extract becomes
4977              a conversion optab.  */
4978           dst1 = make_ssa_name (vectype1);
4979           epilog_stmt
4980               = gimple_build_assign (dst1, BIT_FIELD_REF,
4981                                      build3 (BIT_FIELD_REF, vectype1,
4982                                              new_temp, TYPE_SIZE (vectype1),
4983                                              bitsize_int (0)));
4984           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4985           dst2 =  make_ssa_name (vectype1);
4986           epilog_stmt
4987               = gimple_build_assign (dst2, BIT_FIELD_REF,
4988                                      build3 (BIT_FIELD_REF, vectype1,
4989                                              new_temp, TYPE_SIZE (vectype1),
4990                                              bitsize_int (bitsize)));
4991           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4992         }
4993       else
4994         {
4995           /* Extract via punning to appropriately sized integer mode
4996              vector.  */
4997           tree eltype = build_nonstandard_integer_type (bitsize, 1);
4998           tree etype = build_vector_type (eltype, 2);
4999           gcc_assert (convert_optab_handler (vec_extract_optab,
5000                                              TYPE_MODE (etype),
5001                                              TYPE_MODE (eltype))
5002                       != CODE_FOR_nothing);
5003           tree tem = make_ssa_name (etype);
5004           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5005                                              build1 (VIEW_CONVERT_EXPR,
5006                                                      etype, new_temp));
5007           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5008           new_temp = tem;
5009           tem = make_ssa_name (eltype);
5010           epilog_stmt
5011               = gimple_build_assign (tem, BIT_FIELD_REF,
5012                                      build3 (BIT_FIELD_REF, eltype,
5013                                              new_temp, TYPE_SIZE (eltype),
5014                                              bitsize_int (0)));
5015           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5016           dst1 = make_ssa_name (vectype1);
5017           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5018                                              build1 (VIEW_CONVERT_EXPR,
5019                                                      vectype1, tem));
5020           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5021           tem = make_ssa_name (eltype);
5022           epilog_stmt
5023               = gimple_build_assign (tem, BIT_FIELD_REF,
5024                                      build3 (BIT_FIELD_REF, eltype,
5025                                              new_temp, TYPE_SIZE (eltype),
5026                                              bitsize_int (bitsize)));
5027           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5028           dst2 =  make_ssa_name (vectype1);
5029           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5030                                              build1 (VIEW_CONVERT_EXPR,
5031                                                      vectype1, tem));
5032           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5033         }
5034
5035       new_temp = make_ssa_name (vectype1);
5036       epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5037       gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5038     }
5039
5040   return new_temp;
5041 }
5042
5043 /* Function vect_create_epilog_for_reduction
5044
5045    Create code at the loop-epilog to finalize the result of a reduction
5046    computation.
5047
5048    STMT_INFO is the scalar reduction stmt that is being vectorized.
5049    SLP_NODE is an SLP node containing a group of reduction statements. The
5050      first one in this group is STMT_INFO.
5051    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5052    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5053      (counting from 0)
5054
5055    This function:
5056    1. Completes the reduction def-use cycles.
5057    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5058       by calling the function specified by REDUC_FN if available, or by
5059       other means (whole-vector shifts or a scalar loop).
5060       The function also creates a new phi node at the loop exit to preserve
5061       loop-closed form, as illustrated below.
5062
5063      The flow at the entry to this function:
5064
5065         loop:
5066           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5067           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5068           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5069         loop_exit:
5070           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5071           use <s_out0>
5072           use <s_out0>
5073
5074      The above is transformed by this function into:
5075
5076         loop:
5077           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5078           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5079           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5080         loop_exit:
5081           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5082           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5083           v_out2 = reduce <v_out1>
5084           s_out3 = extract_field <v_out2, 0>
5085           s_out4 = adjust_result <s_out3>
5086           use <s_out4>
5087           use <s_out4>
5088 */
5089
5090 static void
5091 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5092                                   stmt_vec_info stmt_info,
5093                                   slp_tree slp_node,
5094                                   slp_instance slp_node_instance)
5095 {
5096   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5097   gcc_assert (reduc_info->is_reduc_info);
5098   /* For double reductions we need to get at the inner loop reduction
5099      stmt which has the meta info attached.  Our stmt_info is that of the
5100      loop-closed PHI of the inner loop which we remember as
5101      def for the reduction PHI generation.  */
5102   bool double_reduc = false;
5103   stmt_vec_info rdef_info = stmt_info;
5104   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5105     {
5106       gcc_assert (!slp_node);
5107       double_reduc = true;
5108       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5109                                             (stmt_info->stmt, 0));
5110       stmt_info = vect_stmt_to_vectorize (stmt_info);
5111     }
5112   gphi *reduc_def_stmt
5113     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5114   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5115   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5116   tree vectype;
5117   machine_mode mode;
5118   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5119   basic_block exit_bb;
5120   tree scalar_dest;
5121   tree scalar_type;
5122   gimple *new_phi = NULL, *phi;
5123   gimple_stmt_iterator exit_gsi;
5124   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5125   gimple *epilog_stmt = NULL;
5126   gimple *exit_phi;
5127   tree bitsize;
5128   tree def;
5129   tree orig_name, scalar_result;
5130   imm_use_iterator imm_iter, phi_imm_iter;
5131   use_operand_p use_p, phi_use_p;
5132   gimple *use_stmt;
5133   auto_vec<tree> reduc_inputs;
5134   int j, i;
5135   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5136   unsigned int group_size = 1, k;
5137   auto_vec<gimple *> phis;
5138   /* SLP reduction without reduction chain, e.g.,
5139      # a1 = phi <a2, a0>
5140      # b1 = phi <b2, b0>
5141      a2 = operation (a1)
5142      b2 = operation (b1)  */
5143   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5144   bool direct_slp_reduc;
5145   tree induction_index = NULL_TREE;
5146
5147   if (slp_node)
5148     group_size = SLP_TREE_LANES (slp_node);
5149
5150   if (nested_in_vect_loop_p (loop, stmt_info))
5151     {
5152       outer_loop = loop;
5153       loop = loop->inner;
5154       gcc_assert (!slp_node && double_reduc);
5155     }
5156
5157   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5158   gcc_assert (vectype);
5159   mode = TYPE_MODE (vectype);
5160
5161   tree induc_val = NULL_TREE;
5162   tree adjustment_def = NULL;
5163   if (slp_node)
5164     ;
5165   else
5166     {
5167       /* Optimize: for induction condition reduction, if we can't use zero
5168          for induc_val, use initial_def.  */
5169       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5170         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5171       else if (double_reduc)
5172         ;
5173       else
5174         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5175     }
5176
5177   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5178   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5179   if (slp_reduc)
5180     /* All statements produce live-out values.  */
5181     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5182   else if (slp_node)
5183     /* The last statement in the reduction chain produces the live-out
5184        value.  */
5185     single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5186
5187   unsigned vec_num;
5188   int ncopies;
5189   if (slp_node)
5190     {
5191       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5192       ncopies = 1;
5193     }
5194   else
5195     {
5196       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5197       vec_num = 1;
5198       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5199     }
5200
5201   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5202      which is updated with the current index of the loop for every match of
5203      the original loop's cond_expr (VEC_STMT).  This results in a vector
5204      containing the last time the condition passed for that vector lane.
5205      The first match will be a 1 to allow 0 to be used for non-matching
5206      indexes.  If there are no matches at all then the vector will be all
5207      zeroes.
5208
5209      PR92772: This algorithm is broken for architectures that support
5210      masked vectors, but do not provide fold_extract_last.  */
5211   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5212     {
5213       auto_vec<std::pair<tree, bool>, 2> ccompares;
5214       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5215       cond_info = vect_stmt_to_vectorize (cond_info);
5216       while (cond_info != reduc_info)
5217         {
5218           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5219             {
5220               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5221               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5222               ccompares.safe_push
5223                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5224                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5225             }
5226           cond_info
5227             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5228                                                  1 + STMT_VINFO_REDUC_IDX
5229                                                         (cond_info)));
5230           cond_info = vect_stmt_to_vectorize (cond_info);
5231         }
5232       gcc_assert (ccompares.length () != 0);
5233
5234       tree indx_before_incr, indx_after_incr;
5235       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5236       int scalar_precision
5237         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5238       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5239       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5240         (TYPE_MODE (vectype), cr_index_scalar_type,
5241          TYPE_VECTOR_SUBPARTS (vectype));
5242
5243       /* First we create a simple vector induction variable which starts
5244          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5245          vector size (STEP).  */
5246
5247       /* Create a {1,2,3,...} vector.  */
5248       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5249
5250       /* Create a vector of the step value.  */
5251       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5252       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5253
5254       /* Create an induction variable.  */
5255       gimple_stmt_iterator incr_gsi;
5256       bool insert_after;
5257       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5258       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5259                  insert_after, &indx_before_incr, &indx_after_incr);
5260
5261       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5262          filled with zeros (VEC_ZERO).  */
5263
5264       /* Create a vector of 0s.  */
5265       tree zero = build_zero_cst (cr_index_scalar_type);
5266       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5267
5268       /* Create a vector phi node.  */
5269       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5270       new_phi = create_phi_node (new_phi_tree, loop->header);
5271       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5272                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5273
5274       /* Now take the condition from the loops original cond_exprs
5275          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5276          every match uses values from the induction variable
5277          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5278          (NEW_PHI_TREE).
5279          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5280          the new cond_expr (INDEX_COND_EXPR).  */
5281       gimple_seq stmts = NULL;
5282       for (int i = ccompares.length () - 1; i != -1; --i)
5283         {
5284           tree ccompare = ccompares[i].first;
5285           if (ccompares[i].second)
5286             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5287                                          cr_index_vector_type,
5288                                          ccompare,
5289                                          indx_before_incr, new_phi_tree);
5290           else
5291             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5292                                          cr_index_vector_type,
5293                                          ccompare,
5294                                          new_phi_tree, indx_before_incr);
5295         }
5296       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5297
5298       /* Update the phi with the vec cond.  */
5299       induction_index = new_phi_tree;
5300       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5301                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5302     }
5303
5304   /* 2. Create epilog code.
5305         The reduction epilog code operates across the elements of the vector
5306         of partial results computed by the vectorized loop.
5307         The reduction epilog code consists of:
5308
5309         step 1: compute the scalar result in a vector (v_out2)
5310         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5311         step 3: adjust the scalar result (s_out3) if needed.
5312
5313         Step 1 can be accomplished using one the following three schemes:
5314           (scheme 1) using reduc_fn, if available.
5315           (scheme 2) using whole-vector shifts, if available.
5316           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5317                      combined.
5318
5319           The overall epilog code looks like this:
5320
5321           s_out0 = phi <s_loop>         # original EXIT_PHI
5322           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5323           v_out2 = reduce <v_out1>              # step 1
5324           s_out3 = extract_field <v_out2, 0>    # step 2
5325           s_out4 = adjust_result <s_out3>       # step 3
5326
5327           (step 3 is optional, and steps 1 and 2 may be combined).
5328           Lastly, the uses of s_out0 are replaced by s_out4.  */
5329
5330
5331   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5332          v_out1 = phi <VECT_DEF>
5333          Store them in NEW_PHIS.  */
5334   if (double_reduc)
5335     loop = outer_loop;
5336   exit_bb = single_exit (loop)->dest;
5337   exit_gsi = gsi_after_labels (exit_bb);
5338   reduc_inputs.create (slp_node ? vec_num : ncopies);
5339   for (unsigned i = 0; i < vec_num; i++)
5340     {
5341       gimple_seq stmts = NULL;
5342       if (slp_node)
5343         def = vect_get_slp_vect_def (slp_node, i);
5344       else
5345         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5346       for (j = 0; j < ncopies; j++)
5347         {
5348           tree new_def = copy_ssa_name (def);
5349           phi = create_phi_node (new_def, exit_bb);
5350           if (j)
5351             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5352           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5353           new_def = gimple_convert (&stmts, vectype, new_def);
5354           reduc_inputs.quick_push (new_def);
5355         }
5356       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5357     }
5358
5359   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5360          (i.e. when reduc_fn is not available) and in the final adjustment
5361          code (if needed).  Also get the original scalar reduction variable as
5362          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5363          represents a reduction pattern), the tree-code and scalar-def are
5364          taken from the original stmt that the pattern-stmt (STMT) replaces.
5365          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5366          are taken from STMT.  */
5367
5368   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5369   if (orig_stmt_info != stmt_info)
5370     {
5371       /* Reduction pattern  */
5372       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5373       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5374     }
5375
5376   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5377   scalar_type = TREE_TYPE (scalar_dest);
5378   scalar_results.create (group_size);
5379   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5380   bitsize = TYPE_SIZE (scalar_type);
5381
5382   /* True if we should implement SLP_REDUC using native reduction operations
5383      instead of scalar operations.  */
5384   direct_slp_reduc = (reduc_fn != IFN_LAST
5385                       && slp_reduc
5386                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5387
5388   /* In case of reduction chain, e.g.,
5389      # a1 = phi <a3, a0>
5390      a2 = operation (a1)
5391      a3 = operation (a2),
5392
5393      we may end up with more than one vector result.  Here we reduce them
5394      to one vector.
5395
5396      The same is true if we couldn't use a single defuse cycle.  */
5397   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5398       || direct_slp_reduc
5399       || ncopies > 1)
5400     {
5401       gimple_seq stmts = NULL;
5402       tree single_input = reduc_inputs[0];
5403       for (k = 1; k < reduc_inputs.length (); k++)
5404         single_input = gimple_build (&stmts, code, vectype,
5405                                      single_input, reduc_inputs[k]);
5406       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5407
5408       reduc_inputs.truncate (0);
5409       reduc_inputs.safe_push (single_input);
5410     }
5411
5412   tree orig_reduc_input = reduc_inputs[0];
5413
5414   /* If this loop is an epilogue loop that can be skipped after the
5415      main loop, we can only share a reduction operation between the
5416      main loop and the epilogue if we put it at the target of the
5417      skip edge.
5418
5419      We can still reuse accumulators if this check fails.  Doing so has
5420      the minor(?) benefit of making the epilogue loop's scalar result
5421      independent of the main loop's scalar result.  */
5422   bool unify_with_main_loop_p = false;
5423   if (reduc_info->reused_accumulator
5424       && loop_vinfo->skip_this_loop_edge
5425       && single_succ_p (exit_bb)
5426       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5427     {
5428       unify_with_main_loop_p = true;
5429
5430       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5431       reduc_inputs[0] = make_ssa_name (vectype);
5432       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5433       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5434                    UNKNOWN_LOCATION);
5435       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5436                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5437       exit_gsi = gsi_after_labels (reduc_block);
5438     }
5439
5440   /* Shouldn't be used beyond this point.  */
5441   exit_bb = nullptr;
5442
5443   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5444       && reduc_fn != IFN_LAST)
5445     {
5446       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5447          various data values where the condition matched and another vector
5448          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5449          need to extract the last matching index (which will be the index with
5450          highest value) and use this to index into the data vector.
5451          For the case where there were no matches, the data vector will contain
5452          all default values and the index vector will be all zeros.  */
5453
5454       /* Get various versions of the type of the vector of indexes.  */
5455       tree index_vec_type = TREE_TYPE (induction_index);
5456       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5457       tree index_scalar_type = TREE_TYPE (index_vec_type);
5458       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5459
5460       /* Get an unsigned integer version of the type of the data vector.  */
5461       int scalar_precision
5462         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5463       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5464       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5465                                                 vectype);
5466
5467       /* First we need to create a vector (ZERO_VEC) of zeros and another
5468          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5469          can create using a MAX reduction and then expanding.
5470          In the case where the loop never made any matches, the max index will
5471          be zero.  */
5472
5473       /* Vector of {0, 0, 0,...}.  */
5474       tree zero_vec = build_zero_cst (vectype);
5475
5476       /* Find maximum value from the vector of found indexes.  */
5477       tree max_index = make_ssa_name (index_scalar_type);
5478       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5479                                                           1, induction_index);
5480       gimple_call_set_lhs (max_index_stmt, max_index);
5481       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5482
5483       /* Vector of {max_index, max_index, max_index,...}.  */
5484       tree max_index_vec = make_ssa_name (index_vec_type);
5485       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5486                                                       max_index);
5487       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5488                                                         max_index_vec_rhs);
5489       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5490
5491       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5492          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5493          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5494          otherwise.  Only one value should match, resulting in a vector
5495          (VEC_COND) with one data value and the rest zeros.
5496          In the case where the loop never made any matches, every index will
5497          match, resulting in a vector with all data values (which will all be
5498          the default value).  */
5499
5500       /* Compare the max index vector to the vector of found indexes to find
5501          the position of the max value.  */
5502       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5503       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5504                                                       induction_index,
5505                                                       max_index_vec);
5506       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5507
5508       /* Use the compare to choose either values from the data vector or
5509          zero.  */
5510       tree vec_cond = make_ssa_name (vectype);
5511       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5512                                                    vec_compare,
5513                                                    reduc_inputs[0],
5514                                                    zero_vec);
5515       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5516
5517       /* Finally we need to extract the data value from the vector (VEC_COND)
5518          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5519          reduction, but because this doesn't exist, we can use a MAX reduction
5520          instead.  The data value might be signed or a float so we need to cast
5521          it first.
5522          In the case where the loop never made any matches, the data values are
5523          all identical, and so will reduce down correctly.  */
5524
5525       /* Make the matched data values unsigned.  */
5526       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5527       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5528                                        vec_cond);
5529       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5530                                                         VIEW_CONVERT_EXPR,
5531                                                         vec_cond_cast_rhs);
5532       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5533
5534       /* Reduce down to a scalar value.  */
5535       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5536       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5537                                                            1, vec_cond_cast);
5538       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5539       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5540
5541       /* Convert the reduced value back to the result type and set as the
5542          result.  */
5543       gimple_seq stmts = NULL;
5544       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5545                                data_reduc);
5546       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5547       scalar_results.safe_push (new_temp);
5548     }
5549   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5550            && reduc_fn == IFN_LAST)
5551     {
5552       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5553          idx = 0;
5554          idx_val = induction_index[0];
5555          val = data_reduc[0];
5556          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5557            if (induction_index[i] > idx_val)
5558              val = data_reduc[i], idx_val = induction_index[i];
5559          return val;  */
5560
5561       tree data_eltype = TREE_TYPE (vectype);
5562       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5563       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5564       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5565       /* Enforced by vectorizable_reduction, which ensures we have target
5566          support before allowing a conditional reduction on variable-length
5567          vectors.  */
5568       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5569       tree idx_val = NULL_TREE, val = NULL_TREE;
5570       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5571         {
5572           tree old_idx_val = idx_val;
5573           tree old_val = val;
5574           idx_val = make_ssa_name (idx_eltype);
5575           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5576                                              build3 (BIT_FIELD_REF, idx_eltype,
5577                                                      induction_index,
5578                                                      bitsize_int (el_size),
5579                                                      bitsize_int (off)));
5580           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5581           val = make_ssa_name (data_eltype);
5582           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5583                                              build3 (BIT_FIELD_REF,
5584                                                      data_eltype,
5585                                                      reduc_inputs[0],
5586                                                      bitsize_int (el_size),
5587                                                      bitsize_int (off)));
5588           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5589           if (off != 0)
5590             {
5591               tree new_idx_val = idx_val;
5592               if (off != v_size - el_size)
5593                 {
5594                   new_idx_val = make_ssa_name (idx_eltype);
5595                   epilog_stmt = gimple_build_assign (new_idx_val,
5596                                                      MAX_EXPR, idx_val,
5597                                                      old_idx_val);
5598                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5599                 }
5600               tree new_val = make_ssa_name (data_eltype);
5601               epilog_stmt = gimple_build_assign (new_val,
5602                                                  COND_EXPR,
5603                                                  build2 (GT_EXPR,
5604                                                          boolean_type_node,
5605                                                          idx_val,
5606                                                          old_idx_val),
5607                                                  val, old_val);
5608               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5609               idx_val = new_idx_val;
5610               val = new_val;
5611             }
5612         }
5613       /* Convert the reduced value back to the result type and set as the
5614          result.  */
5615       gimple_seq stmts = NULL;
5616       val = gimple_convert (&stmts, scalar_type, val);
5617       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5618       scalar_results.safe_push (val);
5619     }
5620
5621   /* 2.3 Create the reduction code, using one of the three schemes described
5622          above. In SLP we simply need to extract all the elements from the
5623          vector (without reducing them), so we use scalar shifts.  */
5624   else if (reduc_fn != IFN_LAST && !slp_reduc)
5625     {
5626       tree tmp;
5627       tree vec_elem_type;
5628
5629       /* Case 1:  Create:
5630          v_out2 = reduc_expr <v_out1>  */
5631
5632       if (dump_enabled_p ())
5633         dump_printf_loc (MSG_NOTE, vect_location,
5634                          "Reduce using direct vector reduction.\n");
5635
5636       gimple_seq stmts = NULL;
5637       vec_elem_type = TREE_TYPE (vectype);
5638       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5639                                vec_elem_type, reduc_inputs[0]);
5640       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5641       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5642
5643       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5644           && induc_val)
5645         {
5646           /* Earlier we set the initial value to be a vector if induc_val
5647              values.  Check the result and if it is induc_val then replace
5648              with the original initial value, unless induc_val is
5649              the same as initial_def already.  */
5650           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5651                                   induc_val);
5652           tree initial_def = reduc_info->reduc_initial_values[0];
5653
5654           tmp = make_ssa_name (new_scalar_dest);
5655           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5656                                              initial_def, new_temp);
5657           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5658           new_temp = tmp;
5659         }
5660
5661       scalar_results.safe_push (new_temp);
5662     }
5663   else if (direct_slp_reduc)
5664     {
5665       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5666          with the elements for other SLP statements replaced with the
5667          neutral value.  We can then do a normal reduction on each vector.  */
5668
5669       /* Enforced by vectorizable_reduction.  */
5670       gcc_assert (reduc_inputs.length () == 1);
5671       gcc_assert (pow2p_hwi (group_size));
5672
5673       gimple_seq seq = NULL;
5674
5675       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5676          and the same element size as VECTYPE.  */
5677       tree index = build_index_vector (vectype, 0, 1);
5678       tree index_type = TREE_TYPE (index);
5679       tree index_elt_type = TREE_TYPE (index_type);
5680       tree mask_type = truth_type_for (index_type);
5681
5682       /* Create a vector that, for each element, identifies which of
5683          the REDUC_GROUP_SIZE results should use it.  */
5684       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5685       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5686                             build_vector_from_val (index_type, index_mask));
5687
5688       /* Get a neutral vector value.  This is simply a splat of the neutral
5689          scalar value if we have one, otherwise the initial scalar value
5690          is itself a neutral value.  */
5691       tree vector_identity = NULL_TREE;
5692       tree neutral_op = NULL_TREE;
5693       if (slp_node)
5694         {
5695           tree initial_value = NULL_TREE;
5696           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5697             initial_value = reduc_info->reduc_initial_values[0];
5698           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5699                                                  initial_value);
5700         }
5701       if (neutral_op)
5702         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5703                                                         neutral_op);
5704       for (unsigned int i = 0; i < group_size; ++i)
5705         {
5706           /* If there's no univeral neutral value, we can use the
5707              initial scalar value from the original PHI.  This is used
5708              for MIN and MAX reduction, for example.  */
5709           if (!neutral_op)
5710             {
5711               tree scalar_value = reduc_info->reduc_initial_values[i];
5712               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5713                                              scalar_value);
5714               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5715                                                               scalar_value);
5716             }
5717
5718           /* Calculate the equivalent of:
5719
5720              sel[j] = (index[j] == i);
5721
5722              which selects the elements of REDUC_INPUTS[0] that should
5723              be included in the result.  */
5724           tree compare_val = build_int_cst (index_elt_type, i);
5725           compare_val = build_vector_from_val (index_type, compare_val);
5726           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5727                                    index, compare_val);
5728
5729           /* Calculate the equivalent of:
5730
5731              vec = seq ? reduc_inputs[0] : vector_identity;
5732
5733              VEC is now suitable for a full vector reduction.  */
5734           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5735                                    sel, reduc_inputs[0], vector_identity);
5736
5737           /* Do the reduction and convert it to the appropriate type.  */
5738           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5739                                       TREE_TYPE (vectype), vec);
5740           scalar = gimple_convert (&seq, scalar_type, scalar);
5741           scalar_results.safe_push (scalar);
5742         }
5743       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5744     }
5745   else
5746     {
5747       bool reduce_with_shift;
5748       tree vec_temp;
5749
5750       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5751
5752       /* See if the target wants to do the final (shift) reduction
5753          in a vector mode of smaller size and first reduce upper/lower
5754          halves against each other.  */
5755       enum machine_mode mode1 = mode;
5756       tree stype = TREE_TYPE (vectype);
5757       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5758       unsigned nunits1 = nunits;
5759       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5760           && reduc_inputs.length () == 1)
5761         {
5762           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5763           /* For SLP reductions we have to make sure lanes match up, but
5764              since we're doing individual element final reduction reducing
5765              vector width here is even more important.
5766              ???  We can also separate lanes with permutes, for the common
5767              case of power-of-two group-size odd/even extracts would work.  */
5768           if (slp_reduc && nunits != nunits1)
5769             {
5770               nunits1 = least_common_multiple (nunits1, group_size);
5771               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5772             }
5773         }
5774       if (!slp_reduc
5775           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5776         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5777
5778       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5779                                                            stype, nunits1);
5780       reduce_with_shift = have_whole_vector_shift (mode1);
5781       if (!VECTOR_MODE_P (mode1))
5782         reduce_with_shift = false;
5783       else
5784         {
5785           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5786           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5787             reduce_with_shift = false;
5788         }
5789
5790       /* First reduce the vector to the desired vector size we should
5791          do shift reduction on by combining upper and lower halves.  */
5792       gimple_seq stmts = NULL;
5793       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5794                                              code, &stmts);
5795       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796       reduc_inputs[0] = new_temp;
5797
5798       if (reduce_with_shift && !slp_reduc)
5799         {
5800           int element_bitsize = tree_to_uhwi (bitsize);
5801           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5802              for variable-length vectors and also requires direct target support
5803              for loop reductions.  */
5804           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5805           int nelements = vec_size_in_bits / element_bitsize;
5806           vec_perm_builder sel;
5807           vec_perm_indices indices;
5808
5809           int elt_offset;
5810
5811           tree zero_vec = build_zero_cst (vectype1);
5812           /* Case 2: Create:
5813              for (offset = nelements/2; offset >= 1; offset/=2)
5814                 {
5815                   Create:  va' = vec_shift <va, offset>
5816                   Create:  va = vop <va, va'>
5817                 }  */
5818
5819           tree rhs;
5820
5821           if (dump_enabled_p ())
5822             dump_printf_loc (MSG_NOTE, vect_location,
5823                              "Reduce using vector shifts\n");
5824
5825           gimple_seq stmts = NULL;
5826           new_temp = gimple_convert (&stmts, vectype1, new_temp);
5827           for (elt_offset = nelements / 2;
5828                elt_offset >= 1;
5829                elt_offset /= 2)
5830             {
5831               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5832               indices.new_vector (sel, 2, nelements);
5833               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5834               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5835                                        new_temp, zero_vec, mask);
5836               new_temp = gimple_build (&stmts, code,
5837                                        vectype1, new_name, new_temp);
5838             }
5839           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5840
5841           /* 2.4  Extract the final scalar result.  Create:
5842              s_out3 = extract_field <v_out2, bitpos>  */
5843
5844           if (dump_enabled_p ())
5845             dump_printf_loc (MSG_NOTE, vect_location,
5846                              "extract scalar result\n");
5847
5848           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5849                         bitsize, bitsize_zero_node);
5850           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5851           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5852           gimple_assign_set_lhs (epilog_stmt, new_temp);
5853           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5854           scalar_results.safe_push (new_temp);
5855         }
5856       else
5857         {
5858           /* Case 3: Create:
5859              s = extract_field <v_out2, 0>
5860              for (offset = element_size;
5861                   offset < vector_size;
5862                   offset += element_size;)
5863                {
5864                  Create:  s' = extract_field <v_out2, offset>
5865                  Create:  s = op <s, s'>  // For non SLP cases
5866                }  */
5867
5868           if (dump_enabled_p ())
5869             dump_printf_loc (MSG_NOTE, vect_location,
5870                              "Reduce using scalar code.\n");
5871
5872           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5873           int element_bitsize = tree_to_uhwi (bitsize);
5874           tree compute_type = TREE_TYPE (vectype);
5875           gimple_seq stmts = NULL;
5876           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5877             {
5878               int bit_offset;
5879               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5880                                        vec_temp, bitsize, bitsize_zero_node);
5881
5882               /* In SLP we don't need to apply reduction operation, so we just
5883                  collect s' values in SCALAR_RESULTS.  */
5884               if (slp_reduc)
5885                 scalar_results.safe_push (new_temp);
5886
5887               for (bit_offset = element_bitsize;
5888                    bit_offset < vec_size_in_bits;
5889                    bit_offset += element_bitsize)
5890                 {
5891                   tree bitpos = bitsize_int (bit_offset);
5892                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
5893                                            compute_type, vec_temp,
5894                                            bitsize, bitpos);
5895                   if (slp_reduc)
5896                     {
5897                       /* In SLP we don't need to apply reduction operation, so
5898                          we just collect s' values in SCALAR_RESULTS.  */
5899                       new_temp = new_name;
5900                       scalar_results.safe_push (new_name);
5901                     }
5902                   else
5903                     new_temp = gimple_build (&stmts, code, compute_type,
5904                                              new_name, new_temp);
5905                 }
5906             }
5907
5908           /* The only case where we need to reduce scalar results in SLP, is
5909              unrolling.  If the size of SCALAR_RESULTS is greater than
5910              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5911              REDUC_GROUP_SIZE.  */
5912           if (slp_reduc)
5913             {
5914               tree res, first_res, new_res;
5915
5916               /* Reduce multiple scalar results in case of SLP unrolling.  */
5917               for (j = group_size; scalar_results.iterate (j, &res);
5918                    j++)
5919                 {
5920                   first_res = scalar_results[j % group_size];
5921                   new_res = gimple_build (&stmts, code, compute_type,
5922                                           first_res, res);
5923                   scalar_results[j % group_size] = new_res;
5924                 }
5925               scalar_results.truncate (group_size);
5926               for (k = 0; k < group_size; k++)
5927                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5928                                                     scalar_results[k]);
5929             }
5930           else
5931             {
5932               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5933               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5934               scalar_results.safe_push (new_temp);
5935             }
5936
5937           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5938         }
5939
5940       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5941           && induc_val)
5942         {
5943           /* Earlier we set the initial value to be a vector if induc_val
5944              values.  Check the result and if it is induc_val then replace
5945              with the original initial value, unless induc_val is
5946              the same as initial_def already.  */
5947           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5948                                   induc_val);
5949           tree initial_def = reduc_info->reduc_initial_values[0];
5950
5951           tree tmp = make_ssa_name (new_scalar_dest);
5952           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5953                                              initial_def, new_temp);
5954           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5955           scalar_results[0] = tmp;
5956         }
5957     }
5958
5959   /* 2.5 Adjust the final result by the initial value of the reduction
5960          variable. (When such adjustment is not needed, then
5961          'adjustment_def' is zero).  For example, if code is PLUS we create:
5962          new_temp = loop_exit_def + adjustment_def  */
5963
5964   if (adjustment_def)
5965     {
5966       gcc_assert (!slp_reduc);
5967       gimple_seq stmts = NULL;
5968       if (double_reduc)
5969         {
5970           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5971           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5972           new_temp = gimple_build (&stmts, code, vectype,
5973                                    reduc_inputs[0], adjustment_def);
5974         }
5975       else
5976         {
5977           new_temp = scalar_results[0];
5978           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5979           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5980           new_temp = gimple_build (&stmts, code, scalar_type,
5981                                    new_temp, adjustment_def);
5982         }
5983
5984       epilog_stmt = gimple_seq_last_stmt (stmts);
5985       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5986       scalar_results[0] = new_temp;
5987     }
5988
5989   /* Record this operation if it could be reused by the epilogue loop.  */
5990   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5991     loop_vinfo->reusable_accumulators.put (scalar_results[0],
5992                                            { orig_reduc_input, reduc_info });
5993
5994   if (double_reduc)
5995     loop = outer_loop;
5996
5997   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5998           phis with new adjusted scalar results, i.e., replace use <s_out0>
5999           with use <s_out4>.
6000
6001      Transform:
6002         loop_exit:
6003           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6004           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6005           v_out2 = reduce <v_out1>
6006           s_out3 = extract_field <v_out2, 0>
6007           s_out4 = adjust_result <s_out3>
6008           use <s_out0>
6009           use <s_out0>
6010
6011      into:
6012
6013         loop_exit:
6014           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6015           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6016           v_out2 = reduce <v_out1>
6017           s_out3 = extract_field <v_out2, 0>
6018           s_out4 = adjust_result <s_out3>
6019           use <s_out4>
6020           use <s_out4> */
6021
6022   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6023   for (k = 0; k < live_out_stmts.size (); k++)
6024     {
6025       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6026       scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6027
6028       phis.create (3);
6029       /* Find the loop-closed-use at the loop exit of the original scalar
6030          result.  (The reduction result is expected to have two immediate uses,
6031          one at the latch block, and one at the loop exit).  For double
6032          reductions we are looking for exit phis of the outer loop.  */
6033       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6034         {
6035           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6036             {
6037               if (!is_gimple_debug (USE_STMT (use_p)))
6038                 phis.safe_push (USE_STMT (use_p));
6039             }
6040           else
6041             {
6042               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6043                 {
6044                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6045
6046                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6047                     {
6048                       if (!flow_bb_inside_loop_p (loop,
6049                                              gimple_bb (USE_STMT (phi_use_p)))
6050                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6051                         phis.safe_push (USE_STMT (phi_use_p));
6052                     }
6053                 }
6054             }
6055         }
6056
6057       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6058         {
6059           /* Replace the uses:  */
6060           orig_name = PHI_RESULT (exit_phi);
6061
6062           /* Look for a single use at the target of the skip edge.  */
6063           if (unify_with_main_loop_p)
6064             {
6065               use_operand_p use_p;
6066               gimple *user;
6067               if (!single_imm_use (orig_name, &use_p, &user))
6068                 gcc_unreachable ();
6069               orig_name = gimple_get_lhs (user);
6070             }
6071
6072           scalar_result = scalar_results[k];
6073           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6074             {
6075               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6076                 SET_USE (use_p, scalar_result);
6077               update_stmt (use_stmt);
6078             }
6079         }
6080
6081       phis.release ();
6082     }
6083 }
6084
6085 /* Return a vector of type VECTYPE that is equal to the vector select
6086    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6087    before GSI.  */
6088
6089 static tree
6090 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6091                      tree vec, tree identity)
6092 {
6093   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6094   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6095                                           mask, vec, identity);
6096   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6097   return cond;
6098 }
6099
6100 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6101    order, starting with LHS.  Insert the extraction statements before GSI and
6102    associate the new scalar SSA names with variable SCALAR_DEST.
6103    Return the SSA name for the result.  */
6104
6105 static tree
6106 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6107                        tree_code code, tree lhs, tree vector_rhs)
6108 {
6109   tree vectype = TREE_TYPE (vector_rhs);
6110   tree scalar_type = TREE_TYPE (vectype);
6111   tree bitsize = TYPE_SIZE (scalar_type);
6112   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6113   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6114
6115   for (unsigned HOST_WIDE_INT bit_offset = 0;
6116        bit_offset < vec_size_in_bits;
6117        bit_offset += element_bitsize)
6118     {
6119       tree bitpos = bitsize_int (bit_offset);
6120       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6121                          bitsize, bitpos);
6122
6123       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6124       rhs = make_ssa_name (scalar_dest, stmt);
6125       gimple_assign_set_lhs (stmt, rhs);
6126       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6127
6128       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6129       tree new_name = make_ssa_name (scalar_dest, stmt);
6130       gimple_assign_set_lhs (stmt, new_name);
6131       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6132       lhs = new_name;
6133     }
6134   return lhs;
6135 }
6136
6137 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6138    type of the vector input.  */
6139
6140 static internal_fn
6141 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6142 {
6143   internal_fn mask_reduc_fn;
6144
6145   switch (reduc_fn)
6146     {
6147     case IFN_FOLD_LEFT_PLUS:
6148       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6149       break;
6150
6151     default:
6152       return IFN_LAST;
6153     }
6154
6155   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6156                                       OPTIMIZE_FOR_SPEED))
6157     return mask_reduc_fn;
6158   return IFN_LAST;
6159 }
6160
6161 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6162    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6163    statement.  CODE is the operation performed by STMT_INFO and OPS are
6164    its scalar operands.  REDUC_INDEX is the index of the operand in
6165    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6166    implements in-order reduction, or IFN_LAST if we should open-code it.
6167    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6168    that should be used to control the operation in a fully-masked loop.  */
6169
6170 static bool
6171 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6172                                stmt_vec_info stmt_info,
6173                                gimple_stmt_iterator *gsi,
6174                                gimple **vec_stmt, slp_tree slp_node,
6175                                gimple *reduc_def_stmt,
6176                                tree_code code, internal_fn reduc_fn,
6177                                tree ops[3], tree vectype_in,
6178                                int reduc_index, vec_loop_masks *masks)
6179 {
6180   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6181   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6182   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6183
6184   int ncopies;
6185   if (slp_node)
6186     ncopies = 1;
6187   else
6188     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6189
6190   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6191   gcc_assert (ncopies == 1);
6192   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6193
6194   if (slp_node)
6195     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6196                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6197
6198   tree op0 = ops[1 - reduc_index];
6199
6200   int group_size = 1;
6201   stmt_vec_info scalar_dest_def_info;
6202   auto_vec<tree> vec_oprnds0;
6203   if (slp_node)
6204     {
6205       auto_vec<vec<tree> > vec_defs (2);
6206       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6207       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6208       vec_defs[0].release ();
6209       vec_defs[1].release ();
6210       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6211       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6212     }
6213   else
6214     {
6215       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6216                                      op0, &vec_oprnds0);
6217       scalar_dest_def_info = stmt_info;
6218     }
6219
6220   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6221   tree scalar_type = TREE_TYPE (scalar_dest);
6222   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6223
6224   int vec_num = vec_oprnds0.length ();
6225   gcc_assert (vec_num == 1 || slp_node);
6226   tree vec_elem_type = TREE_TYPE (vectype_out);
6227   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6228
6229   tree vector_identity = NULL_TREE;
6230   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6231     vector_identity = build_zero_cst (vectype_out);
6232
6233   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6234   int i;
6235   tree def0;
6236   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6237     {
6238       gimple *new_stmt;
6239       tree mask = NULL_TREE;
6240       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6241         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6242
6243       /* Handle MINUS by adding the negative.  */
6244       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6245         {
6246           tree negated = make_ssa_name (vectype_out);
6247           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6248           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6249           def0 = negated;
6250         }
6251
6252       if (mask && mask_reduc_fn == IFN_LAST)
6253         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6254                                     vector_identity);
6255
6256       /* On the first iteration the input is simply the scalar phi
6257          result, and for subsequent iterations it is the output of
6258          the preceding operation.  */
6259       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6260         {
6261           if (mask && mask_reduc_fn != IFN_LAST)
6262             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6263                                                    def0, mask);
6264           else
6265             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6266                                                    def0);
6267           /* For chained SLP reductions the output of the previous reduction
6268              operation serves as the input of the next. For the final statement
6269              the output cannot be a temporary - we reuse the original
6270              scalar destination of the last statement.  */
6271           if (i != vec_num - 1)
6272             {
6273               gimple_set_lhs (new_stmt, scalar_dest_var);
6274               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6275               gimple_set_lhs (new_stmt, reduc_var);
6276             }
6277         }
6278       else
6279         {
6280           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6281                                              reduc_var, def0);
6282           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6283           /* Remove the statement, so that we can use the same code paths
6284              as for statements that we've just created.  */
6285           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6286           gsi_remove (&tmp_gsi, true);
6287         }
6288
6289       if (i == vec_num - 1)
6290         {
6291           gimple_set_lhs (new_stmt, scalar_dest);
6292           vect_finish_replace_stmt (loop_vinfo,
6293                                     scalar_dest_def_info,
6294                                     new_stmt);
6295         }
6296       else
6297         vect_finish_stmt_generation (loop_vinfo,
6298                                      scalar_dest_def_info,
6299                                      new_stmt, gsi);
6300
6301       if (slp_node)
6302         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6303       else
6304         {
6305           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6306           *vec_stmt = new_stmt;
6307         }
6308     }
6309
6310   return true;
6311 }
6312
6313 /* Function is_nonwrapping_integer_induction.
6314
6315    Check if STMT_VINO (which is part of loop LOOP) both increments and
6316    does not cause overflow.  */
6317
6318 static bool
6319 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6320 {
6321   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6322   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6323   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6324   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6325   widest_int ni, max_loop_value, lhs_max;
6326   wi::overflow_type overflow = wi::OVF_NONE;
6327
6328   /* Make sure the loop is integer based.  */
6329   if (TREE_CODE (base) != INTEGER_CST
6330       || TREE_CODE (step) != INTEGER_CST)
6331     return false;
6332
6333   /* Check that the max size of the loop will not wrap.  */
6334
6335   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6336     return true;
6337
6338   if (! max_stmt_executions (loop, &ni))
6339     return false;
6340
6341   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6342                             &overflow);
6343   if (overflow)
6344     return false;
6345
6346   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6347                             TYPE_SIGN (lhs_type), &overflow);
6348   if (overflow)
6349     return false;
6350
6351   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6352           <= TYPE_PRECISION (lhs_type));
6353 }
6354
6355 /* Check if masking can be supported by inserting a conditional expression.
6356    CODE is the code for the operation.  COND_FN is the conditional internal
6357    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6358 static bool
6359 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6360                          tree vectype_in)
6361 {
6362   if (cond_fn != IFN_LAST
6363       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6364                                          OPTIMIZE_FOR_SPEED))
6365     return false;
6366
6367   switch (code)
6368     {
6369     case DOT_PROD_EXPR:
6370     case SAD_EXPR:
6371       return true;
6372
6373     default:
6374       return false;
6375     }
6376 }
6377
6378 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6379    code for the operation.  VOP is the array of operands.  MASK is the loop
6380    mask.  GSI is a statement iterator used to place the new conditional
6381    expression.  */
6382 static void
6383 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6384                       gimple_stmt_iterator *gsi)
6385 {
6386   switch (code)
6387     {
6388     case DOT_PROD_EXPR:
6389       {
6390         tree vectype = TREE_TYPE (vop[1]);
6391         tree zero = build_zero_cst (vectype);
6392         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6393         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6394                                                mask, vop[1], zero);
6395         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6396         vop[1] = masked_op1;
6397         break;
6398       }
6399
6400     case SAD_EXPR:
6401       {
6402         tree vectype = TREE_TYPE (vop[1]);
6403         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6404         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6405                                                mask, vop[1], vop[0]);
6406         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6407         vop[1] = masked_op1;
6408         break;
6409       }
6410
6411     default:
6412       gcc_unreachable ();
6413     }
6414 }
6415
6416 /* Function vectorizable_reduction.
6417
6418    Check if STMT_INFO performs a reduction operation that can be vectorized.
6419    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6420    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6421    Return true if STMT_INFO is vectorizable in this way.
6422
6423    This function also handles reduction idioms (patterns) that have been
6424    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6425    may be of this form:
6426      X = pattern_expr (arg0, arg1, ..., X)
6427    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6428    sequence that had been detected and replaced by the pattern-stmt
6429    (STMT_INFO).
6430
6431    This function also handles reduction of condition expressions, for example:
6432      for (int i = 0; i < N; i++)
6433        if (a[i] < value)
6434          last = a[i];
6435    This is handled by vectorising the loop and creating an additional vector
6436    containing the loop indexes for which "a[i] < value" was true.  In the
6437    function epilogue this is reduced to a single max value and then used to
6438    index into the vector of results.
6439
6440    In some cases of reduction patterns, the type of the reduction variable X is
6441    different than the type of the other arguments of STMT_INFO.
6442    In such cases, the vectype that is used when transforming STMT_INFO into
6443    a vector stmt is different than the vectype that is used to determine the
6444    vectorization factor, because it consists of a different number of elements
6445    than the actual number of elements that are being operated upon in parallel.
6446
6447    For example, consider an accumulation of shorts into an int accumulator.
6448    On some targets it's possible to vectorize this pattern operating on 8
6449    shorts at a time (hence, the vectype for purposes of determining the
6450    vectorization factor should be V8HI); on the other hand, the vectype that
6451    is used to create the vector form is actually V4SI (the type of the result).
6452
6453    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6454    indicates what is the actual level of parallelism (V8HI in the example), so
6455    that the right vectorization factor would be derived.  This vectype
6456    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6457    be used to create the vectorized stmt.  The right vectype for the vectorized
6458    stmt is obtained from the type of the result X:
6459       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6460
6461    This means that, contrary to "regular" reductions (or "regular" stmts in
6462    general), the following equation:
6463       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6464    does *NOT* necessarily hold for reduction patterns.  */
6465
6466 bool
6467 vectorizable_reduction (loop_vec_info loop_vinfo,
6468                         stmt_vec_info stmt_info, slp_tree slp_node,
6469                         slp_instance slp_node_instance,
6470                         stmt_vector_for_cost *cost_vec)
6471 {
6472   tree scalar_dest;
6473   tree vectype_in = NULL_TREE;
6474   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6475   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6476   stmt_vec_info cond_stmt_vinfo = NULL;
6477   tree scalar_type;
6478   int i;
6479   int ncopies;
6480   bool single_defuse_cycle = false;
6481   bool nested_cycle = false;
6482   bool double_reduc = false;
6483   int vec_num;
6484   tree tem;
6485   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6486   tree cond_reduc_val = NULL_TREE;
6487
6488   /* Make sure it was already recognized as a reduction computation.  */
6489   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6490       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6491       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6492     return false;
6493
6494   /* The stmt we store reduction analysis meta on.  */
6495   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6496   reduc_info->is_reduc_info = true;
6497
6498   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6499     {
6500       if (is_a <gphi *> (stmt_info->stmt))
6501         {
6502           if (slp_node)
6503             {
6504               /* We eventually need to set a vector type on invariant
6505                  arguments.  */
6506               unsigned j;
6507               slp_tree child;
6508               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6509                 if (!vect_maybe_update_slp_op_vectype
6510                        (child, SLP_TREE_VECTYPE (slp_node)))
6511                   {
6512                     if (dump_enabled_p ())
6513                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6514                                        "incompatible vector types for "
6515                                        "invariants\n");
6516                     return false;
6517                   }
6518             }
6519           /* Analysis for double-reduction is done on the outer
6520              loop PHI, nested cycles have no further restrictions.  */
6521           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6522         }
6523       else
6524         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6525       return true;
6526     }
6527
6528   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6529   stmt_vec_info phi_info = stmt_info;
6530   if (!is_a <gphi *> (stmt_info->stmt))
6531     {
6532       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6533       return true;
6534     }
6535   if (slp_node)
6536     {
6537       slp_node_instance->reduc_phis = slp_node;
6538       /* ???  We're leaving slp_node to point to the PHIs, we only
6539          need it to get at the number of vector stmts which wasn't
6540          yet initialized for the instance root.  */
6541     }
6542   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6543     stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6544   else
6545     {
6546       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6547                   == vect_double_reduction_def);
6548       use_operand_p use_p;
6549       gimple *use_stmt;
6550       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6551                                  &use_p, &use_stmt);
6552       gcc_assert (res);
6553       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6554       stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6555     }
6556
6557   /* PHIs should not participate in patterns.  */
6558   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6559   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6560
6561   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6562      and compute the reduction chain length.  Discover the real
6563      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6564   tree reduc_def
6565     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6566                              loop_latch_edge
6567                                (gimple_bb (reduc_def_phi)->loop_father));
6568   unsigned reduc_chain_length = 0;
6569   bool only_slp_reduc_chain = true;
6570   stmt_info = NULL;
6571   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6572   while (reduc_def != PHI_RESULT (reduc_def_phi))
6573     {
6574       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6575       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6576       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6577         {
6578           if (dump_enabled_p ())
6579             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6580                              "reduction chain broken by patterns.\n");
6581           return false;
6582         }
6583       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6584         only_slp_reduc_chain = false;
6585       /* ???  For epilogue generation live members of the chain need
6586          to point back to the PHI via their original stmt for
6587          info_for_reduction to work.  */
6588       if (STMT_VINFO_LIVE_P (vdef))
6589         STMT_VINFO_REDUC_DEF (def) = phi_info;
6590       gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6591       if (!assign)
6592         {
6593           if (dump_enabled_p ())
6594             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595                              "reduction chain includes calls.\n");
6596           return false;
6597         }
6598       if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6599         {
6600           if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6601                                       TREE_TYPE (gimple_assign_rhs1 (assign))))
6602             {
6603               if (dump_enabled_p ())
6604                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605                                  "conversion in the reduction chain.\n");
6606               return false;
6607             }
6608         }
6609       else if (!stmt_info)
6610         /* First non-conversion stmt.  */
6611         stmt_info = vdef;
6612       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6613       reduc_chain_length++;
6614       if (!stmt_info && slp_node)
6615         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6616     }
6617   /* PHIs should not participate in patterns.  */
6618   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6619
6620   if (nested_in_vect_loop_p (loop, stmt_info))
6621     {
6622       loop = loop->inner;
6623       nested_cycle = true;
6624     }
6625
6626   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6627      element.  */
6628   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6629     {
6630       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6631       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6632     }
6633   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6634     gcc_assert (slp_node
6635                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6636
6637   /* 1. Is vectorizable reduction?  */
6638   /* Not supportable if the reduction variable is used in the loop, unless
6639      it's a reduction chain.  */
6640   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6641       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6642     return false;
6643
6644   /* Reductions that are not used even in an enclosing outer-loop,
6645      are expected to be "live" (used out of the loop).  */
6646   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6647       && !STMT_VINFO_LIVE_P (stmt_info))
6648     return false;
6649
6650   /* 2. Has this been recognized as a reduction pattern?
6651
6652      Check if STMT represents a pattern that has been recognized
6653      in earlier analysis stages.  For stmts that represent a pattern,
6654      the STMT_VINFO_RELATED_STMT field records the last stmt in
6655      the original sequence that constitutes the pattern.  */
6656
6657   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6658   if (orig_stmt_info)
6659     {
6660       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6661       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6662     }
6663
6664   /* 3. Check the operands of the operation.  The first operands are defined
6665         inside the loop body. The last operand is the reduction variable,
6666         which is defined by the loop-header-phi.  */
6667
6668   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6669   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6670   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6671   enum tree_code code = gimple_assign_rhs_code (stmt);
6672   bool lane_reduc_code_p
6673     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6674   int op_type = TREE_CODE_LENGTH (code);
6675   enum optab_subtype optab_query_kind = optab_vector;
6676   if (code == DOT_PROD_EXPR
6677       && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6678            != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6679     optab_query_kind = optab_vector_mixed_sign;
6680
6681
6682   scalar_dest = gimple_assign_lhs (stmt);
6683   scalar_type = TREE_TYPE (scalar_dest);
6684   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6685       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6686     return false;
6687
6688   /* Do not try to vectorize bit-precision reductions.  */
6689   if (!type_has_mode_precision_p (scalar_type))
6690     return false;
6691
6692   /* For lane-reducing ops we're reducing the number of reduction PHIs
6693      which means the only use of that may be in the lane-reducing operation.  */
6694   if (lane_reduc_code_p
6695       && reduc_chain_length != 1
6696       && !only_slp_reduc_chain)
6697     {
6698       if (dump_enabled_p ())
6699         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700                          "lane-reducing reduction with extra stmts.\n");
6701       return false;
6702     }
6703
6704   /* All uses but the last are expected to be defined in the loop.
6705      The last use is the reduction variable.  In case of nested cycle this
6706      assumption is not true: we use reduc_index to record the index of the
6707      reduction variable.  */
6708   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6709   /* We need to skip an extra operand for COND_EXPRs with embedded
6710      comparison.  */
6711   unsigned opno_adjust = 0;
6712   if (code == COND_EXPR
6713       && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6714     opno_adjust = 1;
6715   for (i = 0; i < op_type; i++)
6716     {
6717       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6718       if (i == 0 && code == COND_EXPR)
6719         continue;
6720
6721       stmt_vec_info def_stmt_info;
6722       enum vect_def_type dt;
6723       tree op;
6724       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6725                                i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6726                                &def_stmt_info))
6727         {
6728           if (dump_enabled_p ())
6729             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730                              "use not simple.\n");
6731           return false;
6732         }
6733       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6734         continue;
6735
6736       /* There should be only one cycle def in the stmt, the one
6737          leading to reduc_def.  */
6738       if (VECTORIZABLE_CYCLE_DEF (dt))
6739         return false;
6740
6741       /* To properly compute ncopies we are interested in the widest
6742          non-reduction input type in case we're looking at a widening
6743          accumulation that we later handle in vect_transform_reduction.  */
6744       if (lane_reduc_code_p
6745           && tem
6746           && (!vectype_in
6747               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6748                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6749         vectype_in = tem;
6750
6751       if (code == COND_EXPR)
6752         {
6753           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6754           if (dt == vect_constant_def)
6755             {
6756               cond_reduc_dt = dt;
6757               cond_reduc_val = op;
6758             }
6759           if (dt == vect_induction_def
6760               && def_stmt_info
6761               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6762             {
6763               cond_reduc_dt = dt;
6764               cond_stmt_vinfo = def_stmt_info;
6765             }
6766         }
6767     }
6768   if (!vectype_in)
6769     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6770   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6771
6772   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6773   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6774   /* If we have a condition reduction, see if we can simplify it further.  */
6775   if (v_reduc_type == COND_REDUCTION)
6776     {
6777       if (slp_node)
6778         return false;
6779
6780       /* When the condition uses the reduction value in the condition, fail.  */
6781       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6782         {
6783           if (dump_enabled_p ())
6784             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785                              "condition depends on previous iteration\n");
6786           return false;
6787         }
6788
6789       if (reduc_chain_length == 1
6790           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6791                                              vectype_in, OPTIMIZE_FOR_SPEED))
6792         {
6793           if (dump_enabled_p ())
6794             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6795                              "optimizing condition reduction with"
6796                              " FOLD_EXTRACT_LAST.\n");
6797           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6798         }
6799       else if (cond_reduc_dt == vect_induction_def)
6800         {
6801           tree base
6802             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6803           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6804
6805           gcc_assert (TREE_CODE (base) == INTEGER_CST
6806                       && TREE_CODE (step) == INTEGER_CST);
6807           cond_reduc_val = NULL_TREE;
6808           enum tree_code cond_reduc_op_code = ERROR_MARK;
6809           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6810           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6811             ;
6812           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6813              above base; punt if base is the minimum value of the type for
6814              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6815           else if (tree_int_cst_sgn (step) == -1)
6816             {
6817               cond_reduc_op_code = MIN_EXPR;
6818               if (tree_int_cst_sgn (base) == -1)
6819                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6820               else if (tree_int_cst_lt (base,
6821                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6822                 cond_reduc_val
6823                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6824             }
6825           else
6826             {
6827               cond_reduc_op_code = MAX_EXPR;
6828               if (tree_int_cst_sgn (base) == 1)
6829                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6830               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6831                                         base))
6832                 cond_reduc_val
6833                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6834             }
6835           if (cond_reduc_val)
6836             {
6837               if (dump_enabled_p ())
6838                 dump_printf_loc (MSG_NOTE, vect_location,
6839                                  "condition expression based on "
6840                                  "integer induction.\n");
6841               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6842               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6843                 = cond_reduc_val;
6844               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6845             }
6846         }
6847       else if (cond_reduc_dt == vect_constant_def)
6848         {
6849           enum vect_def_type cond_initial_dt;
6850           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6851           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6852           if (cond_initial_dt == vect_constant_def
6853               && types_compatible_p (TREE_TYPE (cond_initial_val),
6854                                      TREE_TYPE (cond_reduc_val)))
6855             {
6856               tree e = fold_binary (LE_EXPR, boolean_type_node,
6857                                     cond_initial_val, cond_reduc_val);
6858               if (e && (integer_onep (e) || integer_zerop (e)))
6859                 {
6860                   if (dump_enabled_p ())
6861                     dump_printf_loc (MSG_NOTE, vect_location,
6862                                      "condition expression based on "
6863                                      "compile time constant.\n");
6864                   /* Record reduction code at analysis stage.  */
6865                   STMT_VINFO_REDUC_CODE (reduc_info)
6866                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6867                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6868                 }
6869             }
6870         }
6871     }
6872
6873   if (STMT_VINFO_LIVE_P (phi_info))
6874     return false;
6875
6876   if (slp_node)
6877     ncopies = 1;
6878   else
6879     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6880
6881   gcc_assert (ncopies >= 1);
6882
6883   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6884
6885   if (nested_cycle)
6886     {
6887       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6888                   == vect_double_reduction_def);
6889       double_reduc = true;
6890     }
6891
6892   /* 4.2. Check support for the epilog operation.
6893
6894           If STMT represents a reduction pattern, then the type of the
6895           reduction variable may be different than the type of the rest
6896           of the arguments.  For example, consider the case of accumulation
6897           of shorts into an int accumulator; The original code:
6898                         S1: int_a = (int) short_a;
6899           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6900
6901           was replaced with:
6902                         STMT: int_acc = widen_sum <short_a, int_acc>
6903
6904           This means that:
6905           1. The tree-code that is used to create the vector operation in the
6906              epilog code (that reduces the partial results) is not the
6907              tree-code of STMT, but is rather the tree-code of the original
6908              stmt from the pattern that STMT is replacing.  I.e, in the example
6909              above we want to use 'widen_sum' in the loop, but 'plus' in the
6910              epilog.
6911           2. The type (mode) we use to check available target support
6912              for the vector operation to be created in the *epilog*, is
6913              determined by the type of the reduction variable (in the example
6914              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6915              However the type (mode) we use to check available target support
6916              for the vector operation to be created *inside the loop*, is
6917              determined by the type of the other arguments to STMT (in the
6918              example we'd check this: optab_handler (widen_sum_optab,
6919              vect_short_mode)).
6920
6921           This is contrary to "regular" reductions, in which the types of all
6922           the arguments are the same as the type of the reduction variable.
6923           For "regular" reductions we can therefore use the same vector type
6924           (and also the same tree-code) when generating the epilog code and
6925           when generating the code inside the loop.  */
6926
6927   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6928   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6929
6930   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6931   if (reduction_type == TREE_CODE_REDUCTION)
6932     {
6933       /* Check whether it's ok to change the order of the computation.
6934          Generally, when vectorizing a reduction we change the order of the
6935          computation.  This may change the behavior of the program in some
6936          cases, so we need to check that this is ok.  One exception is when
6937          vectorizing an outer-loop: the inner-loop is executed sequentially,
6938          and therefore vectorizing reductions in the inner-loop during
6939          outer-loop vectorization is safe.  Likewise when we are vectorizing
6940          a series of reductions using SLP and the VF is one the reductions
6941          are performed in scalar order.  */
6942       if (slp_node
6943           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6944           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6945         ;
6946       else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6947         {
6948           /* When vectorizing a reduction chain w/o SLP the reduction PHI
6949              is not directy used in stmt.  */
6950           if (!only_slp_reduc_chain
6951               && reduc_chain_length != 1)
6952             {
6953               if (dump_enabled_p ())
6954                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6955                                  "in-order reduction chain without SLP.\n");
6956               return false;
6957             }
6958           STMT_VINFO_REDUC_TYPE (reduc_info)
6959             = reduction_type = FOLD_LEFT_REDUCTION;
6960         }
6961       else if (!commutative_tree_code (orig_code)
6962                || !associative_tree_code (orig_code))
6963         {
6964           if (dump_enabled_p ())
6965             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6966                             "reduction: not commutative/associative");
6967           return false;
6968         }
6969     }
6970
6971   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6972       && ncopies > 1)
6973     {
6974       if (dump_enabled_p ())
6975         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6976                          "multiple types in double reduction or condition "
6977                          "reduction or fold-left reduction.\n");
6978       return false;
6979     }
6980
6981   internal_fn reduc_fn = IFN_LAST;
6982   if (reduction_type == TREE_CODE_REDUCTION
6983       || reduction_type == FOLD_LEFT_REDUCTION
6984       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6985       || reduction_type == CONST_COND_REDUCTION)
6986     {
6987       if (reduction_type == FOLD_LEFT_REDUCTION
6988           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6989           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6990         {
6991           if (reduc_fn != IFN_LAST
6992               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6993                                                   OPTIMIZE_FOR_SPEED))
6994             {
6995               if (dump_enabled_p ())
6996                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6997                                  "reduc op not supported by target.\n");
6998
6999               reduc_fn = IFN_LAST;
7000             }
7001         }
7002       else
7003         {
7004           if (!nested_cycle || double_reduc)
7005             {
7006               if (dump_enabled_p ())
7007                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008                                  "no reduc code for scalar code.\n");
7009
7010               return false;
7011             }
7012         }
7013     }
7014   else if (reduction_type == COND_REDUCTION)
7015     {
7016       int scalar_precision
7017         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7018       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7019       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7020                                                 vectype_out);
7021
7022       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7023                                           OPTIMIZE_FOR_SPEED))
7024         reduc_fn = IFN_REDUC_MAX;
7025     }
7026   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7027
7028   if (reduction_type != EXTRACT_LAST_REDUCTION
7029       && (!nested_cycle || double_reduc)
7030       && reduc_fn == IFN_LAST
7031       && !nunits_out.is_constant ())
7032     {
7033       if (dump_enabled_p ())
7034         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7035                          "missing target support for reduction on"
7036                          " variable-length vectors.\n");
7037       return false;
7038     }
7039
7040   /* For SLP reductions, see if there is a neutral value we can use.  */
7041   tree neutral_op = NULL_TREE;
7042   if (slp_node)
7043     {
7044       tree initial_value = NULL_TREE;
7045       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7046         initial_value = vect_phi_initial_value (reduc_def_phi);
7047       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7048                                              orig_code, initial_value);
7049     }
7050
7051   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7052     {
7053       /* We can't support in-order reductions of code such as this:
7054
7055            for (int i = 0; i < n1; ++i)
7056              for (int j = 0; j < n2; ++j)
7057                l += a[j];
7058
7059          since GCC effectively transforms the loop when vectorizing:
7060
7061            for (int i = 0; i < n1 / VF; ++i)
7062              for (int j = 0; j < n2; ++j)
7063                for (int k = 0; k < VF; ++k)
7064                  l += a[j];
7065
7066          which is a reassociation of the original operation.  */
7067       if (dump_enabled_p ())
7068         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7069                          "in-order double reduction not supported.\n");
7070
7071       return false;
7072     }
7073
7074   if (reduction_type == FOLD_LEFT_REDUCTION
7075       && slp_node
7076       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7077     {
7078       /* We cannot use in-order reductions in this case because there is
7079          an implicit reassociation of the operations involved.  */
7080       if (dump_enabled_p ())
7081         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7082                          "in-order unchained SLP reductions not supported.\n");
7083       return false;
7084     }
7085
7086   /* For double reductions, and for SLP reductions with a neutral value,
7087      we construct a variable-length initial vector by loading a vector
7088      full of the neutral value and then shift-and-inserting the start
7089      values into the low-numbered elements.  */
7090   if ((double_reduc || neutral_op)
7091       && !nunits_out.is_constant ()
7092       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7093                                           vectype_out, OPTIMIZE_FOR_SPEED))
7094     {
7095       if (dump_enabled_p ())
7096         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097                          "reduction on variable-length vectors requires"
7098                          " target support for a vector-shift-and-insert"
7099                          " operation.\n");
7100       return false;
7101     }
7102
7103   /* Check extra constraints for variable-length unchained SLP reductions.  */
7104   if (STMT_SLP_TYPE (stmt_info)
7105       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7106       && !nunits_out.is_constant ())
7107     {
7108       /* We checked above that we could build the initial vector when
7109          there's a neutral element value.  Check here for the case in
7110          which each SLP statement has its own initial value and in which
7111          that value needs to be repeated for every instance of the
7112          statement within the initial vector.  */
7113       unsigned int group_size = SLP_TREE_LANES (slp_node);
7114       if (!neutral_op
7115           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7116                                               TREE_TYPE (vectype_out)))
7117         {
7118           if (dump_enabled_p ())
7119             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7120                              "unsupported form of SLP reduction for"
7121                              " variable-length vectors: cannot build"
7122                              " initial vector.\n");
7123           return false;
7124         }
7125       /* The epilogue code relies on the number of elements being a multiple
7126          of the group size.  The duplicate-and-interleave approach to setting
7127          up the initial vector does too.  */
7128       if (!multiple_p (nunits_out, group_size))
7129         {
7130           if (dump_enabled_p ())
7131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7132                              "unsupported form of SLP reduction for"
7133                              " variable-length vectors: the vector size"
7134                              " is not a multiple of the number of results.\n");
7135           return false;
7136         }
7137     }
7138
7139   if (reduction_type == COND_REDUCTION)
7140     {
7141       widest_int ni;
7142
7143       if (! max_loop_iterations (loop, &ni))
7144         {
7145           if (dump_enabled_p ())
7146             dump_printf_loc (MSG_NOTE, vect_location,
7147                              "loop count not known, cannot create cond "
7148                              "reduction.\n");
7149           return false;
7150         }
7151       /* Convert backedges to iterations.  */
7152       ni += 1;
7153
7154       /* The additional index will be the same type as the condition.  Check
7155          that the loop can fit into this less one (because we'll use up the
7156          zero slot for when there are no matches).  */
7157       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7158       if (wi::geu_p (ni, wi::to_widest (max_index)))
7159         {
7160           if (dump_enabled_p ())
7161             dump_printf_loc (MSG_NOTE, vect_location,
7162                              "loop size is greater than data size.\n");
7163           return false;
7164         }
7165     }
7166
7167   /* In case the vectorization factor (VF) is bigger than the number
7168      of elements that we can fit in a vectype (nunits), we have to generate
7169      more than one vector stmt - i.e - we need to "unroll" the
7170      vector stmt by a factor VF/nunits.  For more details see documentation
7171      in vectorizable_operation.  */
7172
7173   /* If the reduction is used in an outer loop we need to generate
7174      VF intermediate results, like so (e.g. for ncopies=2):
7175         r0 = phi (init, r0)
7176         r1 = phi (init, r1)
7177         r0 = x0 + r0;
7178         r1 = x1 + r1;
7179     (i.e. we generate VF results in 2 registers).
7180     In this case we have a separate def-use cycle for each copy, and therefore
7181     for each copy we get the vector def for the reduction variable from the
7182     respective phi node created for this copy.
7183
7184     Otherwise (the reduction is unused in the loop nest), we can combine
7185     together intermediate results, like so (e.g. for ncopies=2):
7186         r = phi (init, r)
7187         r = x0 + r;
7188         r = x1 + r;
7189    (i.e. we generate VF/2 results in a single register).
7190    In this case for each copy we get the vector def for the reduction variable
7191    from the vectorized reduction operation generated in the previous iteration.
7192
7193    This only works when we see both the reduction PHI and its only consumer
7194    in vectorizable_reduction and there are no intermediate stmts
7195    participating.  */
7196   if (ncopies > 1
7197       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7198       && reduc_chain_length == 1)
7199     single_defuse_cycle = true;
7200
7201   if (single_defuse_cycle || lane_reduc_code_p)
7202     {
7203       gcc_assert (code != COND_EXPR);
7204
7205       /* 4. Supportable by target?  */
7206       bool ok = true;
7207
7208       /* 4.1. check support for the operation in the loop  */
7209       optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7210       if (!optab)
7211         {
7212           if (dump_enabled_p ())
7213             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7214                              "no optab.\n");
7215           ok = false;
7216         }
7217
7218       machine_mode vec_mode = TYPE_MODE (vectype_in);
7219       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7220         {
7221           if (dump_enabled_p ())
7222             dump_printf (MSG_NOTE, "op not supported by target.\n");
7223           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7224               || !vect_can_vectorize_without_simd_p (code))
7225             ok = false;
7226           else
7227             if (dump_enabled_p ())
7228               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7229         }
7230
7231       if (vect_emulated_vector_p (vectype_in)
7232           && !vect_can_vectorize_without_simd_p (code))
7233         {
7234           if (dump_enabled_p ())
7235             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7236           return false;
7237         }
7238
7239       /* lane-reducing operations have to go through vect_transform_reduction.
7240          For the other cases try without the single cycle optimization.  */
7241       if (!ok)
7242         {
7243           if (lane_reduc_code_p)
7244             return false;
7245           else
7246             single_defuse_cycle = false;
7247         }
7248     }
7249   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7250
7251   /* If the reduction stmt is one of the patterns that have lane
7252      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7253   if ((ncopies > 1 && ! single_defuse_cycle)
7254       && lane_reduc_code_p)
7255     {
7256       if (dump_enabled_p ())
7257         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258                          "multi def-use cycle not possible for lane-reducing "
7259                          "reduction operation\n");
7260       return false;
7261     }
7262
7263   if (slp_node
7264       && !(!single_defuse_cycle
7265            && code != DOT_PROD_EXPR
7266            && code != WIDEN_SUM_EXPR
7267            && code != SAD_EXPR
7268            && reduction_type != FOLD_LEFT_REDUCTION))
7269     for (i = 0; i < op_type; i++)
7270       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7271         {
7272           if (dump_enabled_p ())
7273             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7274                              "incompatible vector types for invariants\n");
7275           return false;
7276         }
7277
7278   if (slp_node)
7279     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7280   else
7281     vec_num = 1;
7282
7283   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7284                              reduction_type, ncopies, cost_vec);
7285   /* Cost the reduction op inside the loop if transformed via
7286      vect_transform_reduction.  Otherwise this is costed by the
7287      separate vectorizable_* routines.  */
7288   if (single_defuse_cycle
7289       || code == DOT_PROD_EXPR
7290       || code == WIDEN_SUM_EXPR
7291       || code == SAD_EXPR)
7292     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7293
7294   if (dump_enabled_p ()
7295       && reduction_type == FOLD_LEFT_REDUCTION)
7296     dump_printf_loc (MSG_NOTE, vect_location,
7297                      "using an in-order (fold-left) reduction.\n");
7298   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7299   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7300      reductions go through their own vectorizable_* routines.  */
7301   if (!single_defuse_cycle
7302       && code != DOT_PROD_EXPR
7303       && code != WIDEN_SUM_EXPR
7304       && code != SAD_EXPR
7305       && reduction_type != FOLD_LEFT_REDUCTION)
7306     {
7307       stmt_vec_info tem
7308         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7309       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7310         {
7311           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7312           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7313         }
7314       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7315       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7316     }
7317   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7318     {
7319       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7320       internal_fn cond_fn = get_conditional_internal_fn (code);
7321
7322       if (reduction_type != FOLD_LEFT_REDUCTION
7323           && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7324           && (cond_fn == IFN_LAST
7325               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7326                                                   OPTIMIZE_FOR_SPEED)))
7327         {
7328           if (dump_enabled_p ())
7329             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7330                              "can't operate on partial vectors because"
7331                              " no conditional operation is available.\n");
7332           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7333         }
7334       else if (reduction_type == FOLD_LEFT_REDUCTION
7335                && reduc_fn == IFN_LAST
7336                && !expand_vec_cond_expr_p (vectype_in,
7337                                            truth_type_for (vectype_in),
7338                                            SSA_NAME))
7339         {
7340           if (dump_enabled_p ())
7341             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7342                              "can't operate on partial vectors because"
7343                              " no conditional operation is available.\n");
7344           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7345         }
7346       else
7347         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7348                                vectype_in, NULL);
7349     }
7350   return true;
7351 }
7352
7353 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7354    value.  */
7355
7356 bool
7357 vect_transform_reduction (loop_vec_info loop_vinfo,
7358                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7359                           gimple **vec_stmt, slp_tree slp_node)
7360 {
7361   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7362   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7363   int i;
7364   int ncopies;
7365   int vec_num;
7366
7367   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7368   gcc_assert (reduc_info->is_reduc_info);
7369
7370   if (nested_in_vect_loop_p (loop, stmt_info))
7371     {
7372       loop = loop->inner;
7373       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7374     }
7375
7376   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7377   enum tree_code code = gimple_assign_rhs_code (stmt);
7378   int op_type = TREE_CODE_LENGTH (code);
7379
7380   /* Flatten RHS.  */
7381   tree ops[3];
7382   switch (get_gimple_rhs_class (code))
7383     {
7384     case GIMPLE_TERNARY_RHS:
7385       ops[2] = gimple_assign_rhs3 (stmt);
7386       /* Fall thru.  */
7387     case GIMPLE_BINARY_RHS:
7388       ops[0] = gimple_assign_rhs1 (stmt);
7389       ops[1] = gimple_assign_rhs2 (stmt);
7390       break;
7391     default:
7392       gcc_unreachable ();
7393     }
7394
7395   /* All uses but the last are expected to be defined in the loop.
7396      The last use is the reduction variable.  In case of nested cycle this
7397      assumption is not true: we use reduc_index to record the index of the
7398      reduction variable.  */
7399   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7400   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7401   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7402   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7403
7404   if (slp_node)
7405     {
7406       ncopies = 1;
7407       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7408     }
7409   else
7410     {
7411       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7412       vec_num = 1;
7413     }
7414
7415   internal_fn cond_fn = get_conditional_internal_fn (code);
7416   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7417   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7418
7419   /* Transform.  */
7420   tree new_temp = NULL_TREE;
7421   auto_vec<tree> vec_oprnds0;
7422   auto_vec<tree> vec_oprnds1;
7423   auto_vec<tree> vec_oprnds2;
7424   tree def0;
7425
7426   if (dump_enabled_p ())
7427     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7428
7429   /* FORNOW: Multiple types are not supported for condition.  */
7430   if (code == COND_EXPR)
7431     gcc_assert (ncopies == 1);
7432
7433   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7434
7435   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7436   if (reduction_type == FOLD_LEFT_REDUCTION)
7437     {
7438       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7439       return vectorize_fold_left_reduction
7440           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7441            reduc_fn, ops, vectype_in, reduc_index, masks);
7442     }
7443
7444   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7445   gcc_assert (single_defuse_cycle
7446               || code == DOT_PROD_EXPR
7447               || code == WIDEN_SUM_EXPR
7448               || code == SAD_EXPR);
7449
7450   /* Create the destination vector  */
7451   tree scalar_dest = gimple_assign_lhs (stmt);
7452   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7453
7454   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7455                      single_defuse_cycle && reduc_index == 0
7456                      ? NULL_TREE : ops[0], &vec_oprnds0,
7457                      single_defuse_cycle && reduc_index == 1
7458                      ? NULL_TREE : ops[1], &vec_oprnds1,
7459                      op_type == ternary_op
7460                      && !(single_defuse_cycle && reduc_index == 2)
7461                      ? ops[2] : NULL_TREE, &vec_oprnds2);
7462   if (single_defuse_cycle)
7463     {
7464       gcc_assert (!slp_node);
7465       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7466                                      ops[reduc_index],
7467                                      reduc_index == 0 ? &vec_oprnds0
7468                                      : (reduc_index == 1 ? &vec_oprnds1
7469                                         : &vec_oprnds2));
7470     }
7471
7472   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7473     {
7474       gimple *new_stmt;
7475       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7476       if (masked_loop_p && !mask_by_cond_expr)
7477         {
7478           /* Make sure that the reduction accumulator is vop[0].  */
7479           if (reduc_index == 1)
7480             {
7481               gcc_assert (commutative_tree_code (code));
7482               std::swap (vop[0], vop[1]);
7483             }
7484           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7485                                           vectype_in, i);
7486           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7487                                                     vop[0], vop[1], vop[0]);
7488           new_temp = make_ssa_name (vec_dest, call);
7489           gimple_call_set_lhs (call, new_temp);
7490           gimple_call_set_nothrow (call, true);
7491           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7492           new_stmt = call;
7493         }
7494       else
7495         {
7496           if (op_type == ternary_op)
7497             vop[2] = vec_oprnds2[i];
7498
7499           if (masked_loop_p && mask_by_cond_expr)
7500             {
7501               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7502                                               vectype_in, i);
7503               build_vect_cond_expr (code, vop, mask, gsi);
7504             }
7505
7506           new_stmt = gimple_build_assign (vec_dest, code,
7507                                           vop[0], vop[1], vop[2]);
7508           new_temp = make_ssa_name (vec_dest, new_stmt);
7509           gimple_assign_set_lhs (new_stmt, new_temp);
7510           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7511         }
7512
7513       if (slp_node)
7514         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7515       else if (single_defuse_cycle
7516                && i < ncopies - 1)
7517         {
7518           if (reduc_index == 0)
7519             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7520           else if (reduc_index == 1)
7521             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7522           else if (reduc_index == 2)
7523             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7524         }
7525       else
7526         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7527     }
7528
7529   if (!slp_node)
7530     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7531
7532   return true;
7533 }
7534
7535 /* Transform phase of a cycle PHI.  */
7536
7537 bool
7538 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7539                           stmt_vec_info stmt_info, gimple **vec_stmt,
7540                           slp_tree slp_node, slp_instance slp_node_instance)
7541 {
7542   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7543   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7544   int i;
7545   int ncopies;
7546   int j;
7547   bool nested_cycle = false;
7548   int vec_num;
7549
7550   if (nested_in_vect_loop_p (loop, stmt_info))
7551     {
7552       loop = loop->inner;
7553       nested_cycle = true;
7554     }
7555
7556   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7557   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7558   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7559   gcc_assert (reduc_info->is_reduc_info);
7560
7561   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7562       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7563     /* Leave the scalar phi in place.  */
7564     return true;
7565
7566   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7567   /* For a nested cycle we do not fill the above.  */
7568   if (!vectype_in)
7569     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7570   gcc_assert (vectype_in);
7571
7572   if (slp_node)
7573     {
7574       /* The size vect_schedule_slp_instance computes is off for us.  */
7575       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7576                                       * SLP_TREE_LANES (slp_node), vectype_in);
7577       ncopies = 1;
7578     }
7579   else
7580     {
7581       vec_num = 1;
7582       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7583     }
7584
7585   /* Check whether we should use a single PHI node and accumulate
7586      vectors to one before the backedge.  */
7587   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7588     ncopies = 1;
7589
7590   /* Create the destination vector  */
7591   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7592   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7593                                                vectype_out);
7594
7595   /* Get the loop-entry arguments.  */
7596   tree vec_initial_def = NULL_TREE;
7597   auto_vec<tree> vec_initial_defs;
7598   if (slp_node)
7599     {
7600       vec_initial_defs.reserve (vec_num);
7601       if (nested_cycle)
7602         {
7603           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7604           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7605                              &vec_initial_defs);
7606         }
7607       else
7608         {
7609           gcc_assert (slp_node == slp_node_instance->reduc_phis);
7610           vec<tree> &initial_values = reduc_info->reduc_initial_values;
7611           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7612
7613           unsigned int num_phis = stmts.length ();
7614           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7615             num_phis = 1;
7616           initial_values.reserve (num_phis);
7617           for (unsigned int i = 0; i < num_phis; ++i)
7618             {
7619               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7620               initial_values.quick_push (vect_phi_initial_value (this_phi));
7621             }
7622           if (vec_num == 1)
7623             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7624           if (!initial_values.is_empty ())
7625             {
7626               tree initial_value
7627                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7628               tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7629               tree neutral_op
7630                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7631                                             code, initial_value);
7632               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7633                                               &vec_initial_defs, vec_num,
7634                                               stmts.length (), neutral_op);
7635             }
7636         }
7637     }
7638   else
7639     {
7640       /* Get at the scalar def before the loop, that defines the initial
7641          value of the reduction variable.  */
7642       tree initial_def = vect_phi_initial_value (phi);
7643       reduc_info->reduc_initial_values.safe_push (initial_def);
7644       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7645          and we can't use zero for induc_val, use initial_def.  Similarly
7646          for REDUC_MIN and initial_def larger than the base.  */
7647       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7648         {
7649           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7650           if (TREE_CODE (initial_def) == INTEGER_CST
7651               && !integer_zerop (induc_val)
7652               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7653                    && tree_int_cst_lt (initial_def, induc_val))
7654                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7655                       && tree_int_cst_lt (induc_val, initial_def))))
7656             {
7657               induc_val = initial_def;
7658               /* Communicate we used the initial_def to epilouge
7659                  generation.  */
7660               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7661             }
7662           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7663         }
7664       else if (nested_cycle)
7665         {
7666           /* Do not use an adjustment def as that case is not supported
7667              correctly if ncopies is not one.  */
7668           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7669                                          ncopies, initial_def,
7670                                          &vec_initial_defs);
7671         }
7672       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7673                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7674         /* Fill the initial vector with the initial scalar value.  */
7675         vec_initial_def
7676           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7677                                            initial_def, initial_def);
7678       else
7679         {
7680           if (ncopies == 1)
7681             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7682           if (!reduc_info->reduc_initial_values.is_empty ())
7683             {
7684               initial_def = reduc_info->reduc_initial_values[0];
7685               enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7686               tree neutral_op
7687                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7688                                             code, initial_def);
7689               gcc_assert (neutral_op);
7690               /* Try to simplify the vector initialization by applying an
7691                  adjustment after the reduction has been performed.  */
7692               if (!reduc_info->reused_accumulator
7693                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7694                   && !operand_equal_p (neutral_op, initial_def))
7695                 {
7696                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7697                     = initial_def;
7698                   initial_def = neutral_op;
7699                 }
7700               vec_initial_def
7701                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7702                                                  initial_def, neutral_op);
7703             }
7704         }
7705     }
7706
7707   if (vec_initial_def)
7708     {
7709       vec_initial_defs.create (ncopies);
7710       for (i = 0; i < ncopies; ++i)
7711         vec_initial_defs.quick_push (vec_initial_def);
7712     }
7713
7714   if (auto *accumulator = reduc_info->reused_accumulator)
7715     {
7716       tree def = accumulator->reduc_input;
7717       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7718         {
7719           unsigned int nreduc;
7720           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7721                                             (TREE_TYPE (def)),
7722                                           TYPE_VECTOR_SUBPARTS (vectype_out),
7723                                           &nreduc);
7724           gcc_assert (res);
7725           gimple_seq stmts = NULL;
7726           /* Reduce the single vector to a smaller one.  */
7727           if (nreduc != 1)
7728             {
7729               /* Perform the reduction in the appropriate type.  */
7730               tree rvectype = vectype_out;
7731               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7732                                               TREE_TYPE (TREE_TYPE (def))))
7733                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7734                                               TYPE_VECTOR_SUBPARTS
7735                                                 (vectype_out));
7736               def = vect_create_partial_epilog (def, rvectype,
7737                                                 STMT_VINFO_REDUC_CODE
7738                                                   (reduc_info),
7739                                                 &stmts);
7740             }
7741           /* The epilogue loop might use a different vector mode, like
7742              VNx2DI vs. V2DI.  */
7743           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7744             {
7745               tree reduc_type = build_vector_type_for_mode
7746                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7747               def = gimple_convert (&stmts, reduc_type, def);
7748             }
7749           /* Adjust the input so we pick up the partially reduced value
7750              for the skip edge in vect_create_epilog_for_reduction.  */
7751           accumulator->reduc_input = def;
7752           /* And the reduction could be carried out using a different sign.  */
7753           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7754             def = gimple_convert (&stmts, vectype_out, def);
7755           if (loop_vinfo->main_loop_edge)
7756             {
7757               /* While we'd like to insert on the edge this will split
7758                  blocks and disturb bookkeeping, we also will eventually
7759                  need this on the skip edge.  Rely on sinking to
7760                  fixup optimal placement and insert in the pred.  */
7761               gimple_stmt_iterator gsi
7762                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7763               /* Insert before a cond that eventually skips the
7764                  epilogue.  */
7765               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7766                 gsi_prev (&gsi);
7767               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7768             }
7769           else
7770             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7771                                               stmts);
7772         }
7773       if (loop_vinfo->main_loop_edge)
7774         vec_initial_defs[0]
7775           = vect_get_main_loop_result (loop_vinfo, def,
7776                                        vec_initial_defs[0]);
7777       else
7778         vec_initial_defs.safe_push (def);
7779     }
7780
7781   /* Generate the reduction PHIs upfront.  */
7782   for (i = 0; i < vec_num; i++)
7783     {
7784       tree vec_init_def = vec_initial_defs[i];
7785       for (j = 0; j < ncopies; j++)
7786         {
7787           /* Create the reduction-phi that defines the reduction
7788              operand.  */
7789           gphi *new_phi = create_phi_node (vec_dest, loop->header);
7790
7791           /* Set the loop-entry arg of the reduction-phi.  */
7792           if (j != 0 && nested_cycle)
7793             vec_init_def = vec_initial_defs[j];
7794           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7795                        UNKNOWN_LOCATION);
7796
7797           /* The loop-latch arg is set in epilogue processing.  */
7798
7799           if (slp_node)
7800             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7801           else
7802             {
7803               if (j == 0)
7804                 *vec_stmt = new_phi;
7805               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7806             }
7807         }
7808     }
7809
7810   return true;
7811 }
7812
7813 /* Vectorizes LC PHIs.  */
7814
7815 bool
7816 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7817                      stmt_vec_info stmt_info, gimple **vec_stmt,
7818                      slp_tree slp_node)
7819 {
7820   if (!loop_vinfo
7821       || !is_a <gphi *> (stmt_info->stmt)
7822       || gimple_phi_num_args (stmt_info->stmt) != 1)
7823     return false;
7824
7825   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7826       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7827     return false;
7828
7829   if (!vec_stmt) /* transformation not required.  */
7830     {
7831       /* Deal with copies from externs or constants that disguise as
7832          loop-closed PHI nodes (PR97886).  */
7833       if (slp_node
7834           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7835                                                 SLP_TREE_VECTYPE (slp_node)))
7836         {
7837           if (dump_enabled_p ())
7838             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7839                              "incompatible vector types for invariants\n");
7840           return false;
7841         }
7842       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7843       return true;
7844     }
7845
7846   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7847   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7848   basic_block bb = gimple_bb (stmt_info->stmt);
7849   edge e = single_pred_edge (bb);
7850   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7851   auto_vec<tree> vec_oprnds;
7852   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7853                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7854                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7855   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7856     {
7857       /* Create the vectorized LC PHI node.  */
7858       gphi *new_phi = create_phi_node (vec_dest, bb);
7859       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7860       if (slp_node)
7861         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7862       else
7863         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7864     }
7865   if (!slp_node)
7866     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7867
7868   return true;
7869 }
7870
7871 /* Vectorizes PHIs.  */
7872
7873 bool
7874 vectorizable_phi (vec_info *,
7875                   stmt_vec_info stmt_info, gimple **vec_stmt,
7876                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7877 {
7878   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7879     return false;
7880
7881   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7882     return false;
7883
7884   tree vectype = SLP_TREE_VECTYPE (slp_node);
7885
7886   if (!vec_stmt) /* transformation not required.  */
7887     {
7888       slp_tree child;
7889       unsigned i;
7890       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7891         if (!child)
7892           {
7893             if (dump_enabled_p ())
7894               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7895                                "PHI node with unvectorized backedge def\n");
7896             return false;
7897           }
7898         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7899           {
7900             if (dump_enabled_p ())
7901               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7902                                "incompatible vector types for invariants\n");
7903             return false;
7904           }
7905       /* For single-argument PHIs assume coalescing which means zero cost
7906          for the scalar and the vector PHIs.  This avoids artificially
7907          favoring the vector path (but may pessimize it in some cases).  */
7908       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7909         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7910                           vector_stmt, stmt_info, vectype, 0, vect_body);
7911       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7912       return true;
7913     }
7914
7915   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7916   basic_block bb = gimple_bb (stmt_info->stmt);
7917   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7918   auto_vec<gphi *> new_phis;
7919   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7920     {
7921       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7922
7923       /* Skip not yet vectorized defs.  */
7924       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7925           && SLP_TREE_VEC_STMTS (child).is_empty ())
7926         continue;
7927
7928       auto_vec<tree> vec_oprnds;
7929       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7930       if (!new_phis.exists ())
7931         {
7932           new_phis.create (vec_oprnds.length ());
7933           for (unsigned j = 0; j < vec_oprnds.length (); j++)
7934             {
7935               /* Create the vectorized LC PHI node.  */
7936               new_phis.quick_push (create_phi_node (vec_dest, bb));
7937               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7938             }
7939         }
7940       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7941       for (unsigned j = 0; j < vec_oprnds.length (); j++)
7942         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7943     }
7944   /* We should have at least one already vectorized child.  */
7945   gcc_assert (new_phis.exists ());
7946
7947   return true;
7948 }
7949
7950 /* Return true if VECTYPE represents a vector that requires lowering
7951    by the vector lowering pass.  */
7952
7953 bool
7954 vect_emulated_vector_p (tree vectype)
7955 {
7956   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7957           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7958               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7959 }
7960
7961 /* Return true if we can emulate CODE on an integer mode representation
7962    of a vector.  */
7963
7964 bool
7965 vect_can_vectorize_without_simd_p (tree_code code)
7966 {
7967   switch (code)
7968     {
7969     case PLUS_EXPR:
7970     case MINUS_EXPR:
7971     case NEGATE_EXPR:
7972     case BIT_AND_EXPR:
7973     case BIT_IOR_EXPR:
7974     case BIT_XOR_EXPR:
7975     case BIT_NOT_EXPR:
7976       return true;
7977
7978     default:
7979       return false;
7980     }
7981 }
7982
7983 /* Function vectorizable_induction
7984
7985    Check if STMT_INFO performs an induction computation that can be vectorized.
7986    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7987    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7988    Return true if STMT_INFO is vectorizable in this way.  */
7989
7990 bool
7991 vectorizable_induction (loop_vec_info loop_vinfo,
7992                         stmt_vec_info stmt_info,
7993                         gimple **vec_stmt, slp_tree slp_node,
7994                         stmt_vector_for_cost *cost_vec)
7995 {
7996   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7997   unsigned ncopies;
7998   bool nested_in_vect_loop = false;
7999   class loop *iv_loop;
8000   tree vec_def;
8001   edge pe = loop_preheader_edge (loop);
8002   basic_block new_bb;
8003   tree new_vec, vec_init, vec_step, t;
8004   tree new_name;
8005   gimple *new_stmt;
8006   gphi *induction_phi;
8007   tree induc_def, vec_dest;
8008   tree init_expr, step_expr;
8009   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8010   unsigned i;
8011   tree expr;
8012   gimple_stmt_iterator si;
8013
8014   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8015   if (!phi)
8016     return false;
8017
8018   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8019     return false;
8020
8021   /* Make sure it was recognized as induction computation.  */
8022   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8023     return false;
8024
8025   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8026   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8027
8028   if (slp_node)
8029     ncopies = 1;
8030   else
8031     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8032   gcc_assert (ncopies >= 1);
8033
8034   /* FORNOW. These restrictions should be relaxed.  */
8035   if (nested_in_vect_loop_p (loop, stmt_info))
8036     {
8037       imm_use_iterator imm_iter;
8038       use_operand_p use_p;
8039       gimple *exit_phi;
8040       edge latch_e;
8041       tree loop_arg;
8042
8043       if (ncopies > 1)
8044         {
8045           if (dump_enabled_p ())
8046             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8047                              "multiple types in nested loop.\n");
8048           return false;
8049         }
8050
8051       exit_phi = NULL;
8052       latch_e = loop_latch_edge (loop->inner);
8053       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8054       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8055         {
8056           gimple *use_stmt = USE_STMT (use_p);
8057           if (is_gimple_debug (use_stmt))
8058             continue;
8059
8060           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8061             {
8062               exit_phi = use_stmt;
8063               break;
8064             }
8065         }
8066       if (exit_phi)
8067         {
8068           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8069           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8070                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8071             {
8072               if (dump_enabled_p ())
8073                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8074                                  "inner-loop induction only used outside "
8075                                  "of the outer vectorized loop.\n");
8076               return false;
8077             }
8078         }
8079
8080       nested_in_vect_loop = true;
8081       iv_loop = loop->inner;
8082     }
8083   else
8084     iv_loop = loop;
8085   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8086
8087   if (slp_node && !nunits.is_constant ())
8088     {
8089       /* The current SLP code creates the step value element-by-element.  */
8090       if (dump_enabled_p ())
8091         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8092                          "SLP induction not supported for variable-length"
8093                          " vectors.\n");
8094       return false;
8095     }
8096
8097   if (!vec_stmt) /* transformation not required.  */
8098     {
8099       unsigned inside_cost = 0, prologue_cost = 0;
8100       if (slp_node)
8101         {
8102           /* We eventually need to set a vector type on invariant
8103              arguments.  */
8104           unsigned j;
8105           slp_tree child;
8106           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8107             if (!vect_maybe_update_slp_op_vectype
8108                 (child, SLP_TREE_VECTYPE (slp_node)))
8109               {
8110                 if (dump_enabled_p ())
8111                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8112                                    "incompatible vector types for "
8113                                    "invariants\n");
8114                 return false;
8115               }
8116           /* loop cost for vec_loop.  */
8117           inside_cost
8118             = record_stmt_cost (cost_vec,
8119                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8120                                 vector_stmt, stmt_info, 0, vect_body);
8121           /* prologue cost for vec_init (if not nested) and step.  */
8122           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8123                                             scalar_to_vec,
8124                                             stmt_info, 0, vect_prologue);
8125         }
8126       else /* if (!slp_node) */
8127         {
8128           /* loop cost for vec_loop.  */
8129           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8130                                           stmt_info, 0, vect_body);
8131           /* prologue cost for vec_init and vec_step.  */
8132           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8133                                             stmt_info, 0, vect_prologue);
8134         }
8135       if (dump_enabled_p ())
8136         dump_printf_loc (MSG_NOTE, vect_location,
8137                          "vect_model_induction_cost: inside_cost = %d, "
8138                          "prologue_cost = %d .\n", inside_cost,
8139                          prologue_cost);
8140
8141       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8142       DUMP_VECT_SCOPE ("vectorizable_induction");
8143       return true;
8144     }
8145
8146   /* Transform.  */
8147
8148   /* Compute a vector variable, initialized with the first VF values of
8149      the induction variable.  E.g., for an iv with IV_PHI='X' and
8150      evolution S, for a vector of 4 units, we want to compute:
8151      [X, X + S, X + 2*S, X + 3*S].  */
8152
8153   if (dump_enabled_p ())
8154     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8155
8156   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8157   gcc_assert (step_expr != NULL_TREE);
8158   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8159
8160   pe = loop_preheader_edge (iv_loop);
8161   /* Find the first insertion point in the BB.  */
8162   basic_block bb = gimple_bb (phi);
8163   si = gsi_after_labels (bb);
8164
8165   /* For SLP induction we have to generate several IVs as for example
8166      with group size 3 we need
8167        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8168        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8169   if (slp_node)
8170     {
8171       /* Enforced above.  */
8172       unsigned int const_nunits = nunits.to_constant ();
8173
8174       /* The initial values are vectorized, but any lanes > group_size
8175          need adjustment.  */
8176       slp_tree init_node
8177         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8178
8179       /* Gather steps.  Since we do not vectorize inductions as
8180          cycles we have to reconstruct the step from SCEV data.  */
8181       unsigned group_size = SLP_TREE_LANES (slp_node);
8182       tree *steps = XALLOCAVEC (tree, group_size);
8183       tree *inits = XALLOCAVEC (tree, group_size);
8184       stmt_vec_info phi_info;
8185       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8186         {
8187           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8188           if (!init_node)
8189             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8190                                            pe->dest_idx);
8191         }
8192
8193       /* Now generate the IVs.  */
8194       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8195       gcc_assert ((const_nunits * nvects) % group_size == 0);
8196       unsigned nivs;
8197       if (nested_in_vect_loop)
8198         nivs = nvects;
8199       else
8200         {
8201           /* Compute the number of distinct IVs we need.  First reduce
8202              group_size if it is a multiple of const_nunits so we get
8203              one IV for a group_size of 4 but const_nunits 2.  */
8204           unsigned group_sizep = group_size;
8205           if (group_sizep % const_nunits == 0)
8206             group_sizep = group_sizep / const_nunits;
8207           nivs = least_common_multiple (group_sizep,
8208                                         const_nunits) / const_nunits;
8209         }
8210       tree stept = TREE_TYPE (step_vectype);
8211       tree lupdate_mul = NULL_TREE;
8212       if (!nested_in_vect_loop)
8213         {
8214           /* The number of iterations covered in one vector iteration.  */
8215           unsigned lup_mul = (nvects * const_nunits) / group_size;
8216           lupdate_mul
8217             = build_vector_from_val (step_vectype,
8218                                      SCALAR_FLOAT_TYPE_P (stept)
8219                                      ? build_real_from_wide (stept, lup_mul,
8220                                                              UNSIGNED)
8221                                      : build_int_cstu (stept, lup_mul));
8222         }
8223       tree peel_mul = NULL_TREE;
8224       gimple_seq init_stmts = NULL;
8225       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8226         {
8227           if (SCALAR_FLOAT_TYPE_P (stept))
8228             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8229                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8230           else
8231             peel_mul = gimple_convert (&init_stmts, stept,
8232                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8233           peel_mul = gimple_build_vector_from_val (&init_stmts,
8234                                                    step_vectype, peel_mul);
8235         }
8236       unsigned ivn;
8237       auto_vec<tree> vec_steps;
8238       for (ivn = 0; ivn < nivs; ++ivn)
8239         {
8240           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8241           tree_vector_builder init_elts (vectype, const_nunits, 1);
8242           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8243           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8244             {
8245               /* The scalar steps of the IVs.  */
8246               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8247               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8248               step_elts.quick_push (elt);
8249               if (!init_node)
8250                 {
8251                   /* The scalar inits of the IVs if not vectorized.  */
8252                   elt = inits[(ivn*const_nunits + eltn) % group_size];
8253                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
8254                                                   TREE_TYPE (elt)))
8255                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8256                                         TREE_TYPE (vectype), elt);
8257                   init_elts.quick_push (elt);
8258                 }
8259               /* The number of steps to add to the initial values.  */
8260               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8261               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8262                                    ? build_real_from_wide (stept,
8263                                                            mul_elt, UNSIGNED)
8264                                    : build_int_cstu (stept, mul_elt));
8265             }
8266           vec_step = gimple_build_vector (&init_stmts, &step_elts);
8267           vec_steps.safe_push (vec_step);
8268           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8269           if (peel_mul)
8270             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8271                                      step_mul, peel_mul);
8272           if (!init_node)
8273             vec_init = gimple_build_vector (&init_stmts, &init_elts);
8274
8275           /* Create the induction-phi that defines the induction-operand.  */
8276           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8277                                             "vec_iv_");
8278           induction_phi = create_phi_node (vec_dest, iv_loop->header);
8279           induc_def = PHI_RESULT (induction_phi);
8280
8281           /* Create the iv update inside the loop  */
8282           tree up = vec_step;
8283           if (lupdate_mul)
8284             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8285                                vec_step, lupdate_mul);
8286           gimple_seq stmts = NULL;
8287           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8288           vec_def = gimple_build (&stmts,
8289                                   PLUS_EXPR, step_vectype, vec_def, up);
8290           vec_def = gimple_convert (&stmts, vectype, vec_def);
8291           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8292           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8293                        UNKNOWN_LOCATION);
8294
8295           if (init_node)
8296             vec_init = vect_get_slp_vect_def (init_node, ivn);
8297           if (!nested_in_vect_loop
8298               && !integer_zerop (step_mul))
8299             {
8300               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8301               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8302                                  vec_step, step_mul);
8303               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8304                                       vec_def, up);
8305               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8306             }
8307
8308           /* Set the arguments of the phi node:  */
8309           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8310
8311           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8312         }
8313       if (!nested_in_vect_loop)
8314         {
8315           /* Fill up to the number of vectors we need for the whole group.  */
8316           nivs = least_common_multiple (group_size,
8317                                         const_nunits) / const_nunits;
8318           vec_steps.reserve (nivs-ivn);
8319           for (; ivn < nivs; ++ivn)
8320             {
8321               SLP_TREE_VEC_STMTS (slp_node)
8322                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8323               vec_steps.quick_push (vec_steps[0]);
8324             }
8325         }
8326
8327       /* Re-use IVs when we can.  We are generating further vector
8328          stmts by adding VF' * stride to the IVs generated above.  */
8329       if (ivn < nvects)
8330         {
8331           unsigned vfp
8332             = least_common_multiple (group_size, const_nunits) / group_size;
8333           tree lupdate_mul
8334             = build_vector_from_val (step_vectype,
8335                                      SCALAR_FLOAT_TYPE_P (stept)
8336                                      ? build_real_from_wide (stept,
8337                                                              vfp, UNSIGNED)
8338                                      : build_int_cstu (stept, vfp));
8339           for (; ivn < nvects; ++ivn)
8340             {
8341               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8342               tree def = gimple_get_lhs (iv);
8343               if (ivn < 2*nivs)
8344                 vec_steps[ivn - nivs]
8345                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8346                                   vec_steps[ivn - nivs], lupdate_mul);
8347               gimple_seq stmts = NULL;
8348               def = gimple_convert (&stmts, step_vectype, def);
8349               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8350                                   def, vec_steps[ivn % nivs]);
8351               def = gimple_convert (&stmts, vectype, def);
8352               if (gimple_code (iv) == GIMPLE_PHI)
8353                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8354               else
8355                 {
8356                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8357                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8358                 }
8359               SLP_TREE_VEC_STMTS (slp_node)
8360                 .quick_push (SSA_NAME_DEF_STMT (def));
8361             }
8362         }
8363
8364       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8365       gcc_assert (!new_bb);
8366
8367       return true;
8368     }
8369
8370   init_expr = vect_phi_initial_value (phi);
8371
8372   gimple_seq stmts = NULL;
8373   if (!nested_in_vect_loop)
8374     {
8375       /* Convert the initial value to the IV update type.  */
8376       tree new_type = TREE_TYPE (step_expr);
8377       init_expr = gimple_convert (&stmts, new_type, init_expr);
8378
8379       /* If we are using the loop mask to "peel" for alignment then we need
8380          to adjust the start value here.  */
8381       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8382       if (skip_niters != NULL_TREE)
8383         {
8384           if (FLOAT_TYPE_P (vectype))
8385             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8386                                         skip_niters);
8387           else
8388             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8389           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8390                                          skip_niters, step_expr);
8391           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8392                                     init_expr, skip_step);
8393         }
8394     }
8395
8396   if (stmts)
8397     {
8398       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8399       gcc_assert (!new_bb);
8400     }
8401
8402   /* Create the vector that holds the initial_value of the induction.  */
8403   if (nested_in_vect_loop)
8404     {
8405       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8406          been created during vectorization of previous stmts.  We obtain it
8407          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8408       auto_vec<tree> vec_inits;
8409       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8410                                      init_expr, &vec_inits);
8411       vec_init = vec_inits[0];
8412       /* If the initial value is not of proper type, convert it.  */
8413       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8414         {
8415           new_stmt
8416             = gimple_build_assign (vect_get_new_ssa_name (vectype,
8417                                                           vect_simple_var,
8418                                                           "vec_iv_"),
8419                                    VIEW_CONVERT_EXPR,
8420                                    build1 (VIEW_CONVERT_EXPR, vectype,
8421                                            vec_init));
8422           vec_init = gimple_assign_lhs (new_stmt);
8423           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8424                                                  new_stmt);
8425           gcc_assert (!new_bb);
8426         }
8427     }
8428   else
8429     {
8430       /* iv_loop is the loop to be vectorized. Create:
8431          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8432       stmts = NULL;
8433       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8434
8435       unsigned HOST_WIDE_INT const_nunits;
8436       if (nunits.is_constant (&const_nunits))
8437         {
8438           tree_vector_builder elts (step_vectype, const_nunits, 1);
8439           elts.quick_push (new_name);
8440           for (i = 1; i < const_nunits; i++)
8441             {
8442               /* Create: new_name_i = new_name + step_expr  */
8443               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8444                                        new_name, step_expr);
8445               elts.quick_push (new_name);
8446             }
8447           /* Create a vector from [new_name_0, new_name_1, ...,
8448              new_name_nunits-1]  */
8449           vec_init = gimple_build_vector (&stmts, &elts);
8450         }
8451       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8452         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
8453         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8454                                  new_name, step_expr);
8455       else
8456         {
8457           /* Build:
8458                 [base, base, base, ...]
8459                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8460           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8461           gcc_assert (flag_associative_math);
8462           tree index = build_index_vector (step_vectype, 0, 1);
8463           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8464                                                         new_name);
8465           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8466                                                         step_expr);
8467           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8468           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8469                                    vec_init, step_vec);
8470           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8471                                    vec_init, base_vec);
8472         }
8473       vec_init = gimple_convert (&stmts, vectype, vec_init);
8474
8475       if (stmts)
8476         {
8477           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8478           gcc_assert (!new_bb);
8479         }
8480     }
8481
8482
8483   /* Create the vector that holds the step of the induction.  */
8484   if (nested_in_vect_loop)
8485     /* iv_loop is nested in the loop to be vectorized. Generate:
8486        vec_step = [S, S, S, S]  */
8487     new_name = step_expr;
8488   else
8489     {
8490       /* iv_loop is the loop to be vectorized. Generate:
8491           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8492       gimple_seq seq = NULL;
8493       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8494         {
8495           expr = build_int_cst (integer_type_node, vf);
8496           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8497         }
8498       else
8499         expr = build_int_cst (TREE_TYPE (step_expr), vf);
8500       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8501                                expr, step_expr);
8502       if (seq)
8503         {
8504           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8505           gcc_assert (!new_bb);
8506         }
8507     }
8508
8509   t = unshare_expr (new_name);
8510   gcc_assert (CONSTANT_CLASS_P (new_name)
8511               || TREE_CODE (new_name) == SSA_NAME);
8512   new_vec = build_vector_from_val (step_vectype, t);
8513   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8514                                new_vec, step_vectype, NULL);
8515
8516
8517   /* Create the following def-use cycle:
8518      loop prolog:
8519          vec_init = ...
8520          vec_step = ...
8521      loop:
8522          vec_iv = PHI <vec_init, vec_loop>
8523          ...
8524          STMT
8525          ...
8526          vec_loop = vec_iv + vec_step;  */
8527
8528   /* Create the induction-phi that defines the induction-operand.  */
8529   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8530   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8531   induc_def = PHI_RESULT (induction_phi);
8532
8533   /* Create the iv update inside the loop  */
8534   stmts = NULL;
8535   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8536   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8537   vec_def = gimple_convert (&stmts, vectype, vec_def);
8538   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8539   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8540
8541   /* Set the arguments of the phi node:  */
8542   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8543   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8544                UNKNOWN_LOCATION);
8545
8546   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8547   *vec_stmt = induction_phi;
8548
8549   /* In case that vectorization factor (VF) is bigger than the number
8550      of elements that we can fit in a vectype (nunits), we have to generate
8551      more than one vector stmt - i.e - we need to "unroll" the
8552      vector stmt by a factor VF/nunits.  For more details see documentation
8553      in vectorizable_operation.  */
8554
8555   if (ncopies > 1)
8556     {
8557       gimple_seq seq = NULL;
8558       /* FORNOW. This restriction should be relaxed.  */
8559       gcc_assert (!nested_in_vect_loop);
8560
8561       /* Create the vector that holds the step of the induction.  */
8562       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8563         {
8564           expr = build_int_cst (integer_type_node, nunits);
8565           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8566         }
8567       else
8568         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8569       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8570                                expr, step_expr);
8571       if (seq)
8572         {
8573           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8574           gcc_assert (!new_bb);
8575         }
8576
8577       t = unshare_expr (new_name);
8578       gcc_assert (CONSTANT_CLASS_P (new_name)
8579                   || TREE_CODE (new_name) == SSA_NAME);
8580       new_vec = build_vector_from_val (step_vectype, t);
8581       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8582                                    new_vec, step_vectype, NULL);
8583
8584       vec_def = induc_def;
8585       for (i = 1; i < ncopies; i++)
8586         {
8587           /* vec_i = vec_prev + vec_step  */
8588           gimple_seq stmts = NULL;
8589           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8590           vec_def = gimple_build (&stmts,
8591                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
8592           vec_def = gimple_convert (&stmts, vectype, vec_def);
8593
8594           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8595           new_stmt = SSA_NAME_DEF_STMT (vec_def);
8596           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8597         }
8598     }
8599
8600   if (dump_enabled_p ())
8601     dump_printf_loc (MSG_NOTE, vect_location,
8602                      "transform induction: created def-use cycle: %G%G",
8603                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
8604
8605   return true;
8606 }
8607
8608 /* Function vectorizable_live_operation.
8609
8610    STMT_INFO computes a value that is used outside the loop.  Check if
8611    it can be supported.  */
8612
8613 bool
8614 vectorizable_live_operation (vec_info *vinfo,
8615                              stmt_vec_info stmt_info,
8616                              gimple_stmt_iterator *gsi,
8617                              slp_tree slp_node, slp_instance slp_node_instance,
8618                              int slp_index, bool vec_stmt_p,
8619                              stmt_vector_for_cost *cost_vec)
8620 {
8621   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8622   imm_use_iterator imm_iter;
8623   tree lhs, lhs_type, bitsize;
8624   tree vectype = (slp_node
8625                   ? SLP_TREE_VECTYPE (slp_node)
8626                   : STMT_VINFO_VECTYPE (stmt_info));
8627   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8628   int ncopies;
8629   gimple *use_stmt;
8630   auto_vec<tree> vec_oprnds;
8631   int vec_entry = 0;
8632   poly_uint64 vec_index = 0;
8633
8634   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8635
8636   /* If a stmt of a reduction is live, vectorize it via
8637      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8638      validity so just trigger the transform here.  */
8639   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8640     {
8641       if (!vec_stmt_p)
8642         return true;
8643       if (slp_node)
8644         {
8645           /* For reduction chains the meta-info is attached to
8646              the group leader.  */
8647           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8648             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8649           /* For SLP reductions we vectorize the epilogue for
8650              all involved stmts together.  */
8651           else if (slp_index != 0)
8652             return true;
8653           else
8654             /* For SLP reductions the meta-info is attached to
8655                the representative.  */
8656             stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8657         }
8658       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8659       gcc_assert (reduc_info->is_reduc_info);
8660       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8661           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8662         return true;
8663       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8664                                         slp_node_instance);
8665       return true;
8666     }
8667
8668   /* If STMT is not relevant and it is a simple assignment and its inputs are
8669      invariant then it can remain in place, unvectorized.  The original last
8670      scalar value that it computes will be used.  */
8671   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8672     {
8673       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8674       if (dump_enabled_p ())
8675         dump_printf_loc (MSG_NOTE, vect_location,
8676                          "statement is simple and uses invariant.  Leaving in "
8677                          "place.\n");
8678       return true;
8679     }
8680
8681   if (slp_node)
8682     ncopies = 1;
8683   else
8684     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8685
8686   if (slp_node)
8687     {
8688       gcc_assert (slp_index >= 0);
8689
8690       /* Get the last occurrence of the scalar index from the concatenation of
8691          all the slp vectors. Calculate which slp vector it is and the index
8692          within.  */
8693       int num_scalar = SLP_TREE_LANES (slp_node);
8694       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8695       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8696
8697       /* Calculate which vector contains the result, and which lane of
8698          that vector we need.  */
8699       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8700         {
8701           if (dump_enabled_p ())
8702             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8703                              "Cannot determine which vector holds the"
8704                              " final result.\n");
8705           return false;
8706         }
8707     }
8708
8709   if (!vec_stmt_p)
8710     {
8711       /* No transformation required.  */
8712       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8713         {
8714           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8715                                                OPTIMIZE_FOR_SPEED))
8716             {
8717               if (dump_enabled_p ())
8718                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8719                                  "can't operate on partial vectors "
8720                                  "because the target doesn't support extract "
8721                                  "last reduction.\n");
8722               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8723             }
8724           else if (slp_node)
8725             {
8726               if (dump_enabled_p ())
8727                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8728                                  "can't operate on partial vectors "
8729                                  "because an SLP statement is live after "
8730                                  "the loop.\n");
8731               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8732             }
8733           else if (ncopies > 1)
8734             {
8735               if (dump_enabled_p ())
8736                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8737                                  "can't operate on partial vectors "
8738                                  "because ncopies is greater than 1.\n");
8739               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8740             }
8741           else
8742             {
8743               gcc_assert (ncopies == 1 && !slp_node);
8744               vect_record_loop_mask (loop_vinfo,
8745                                      &LOOP_VINFO_MASKS (loop_vinfo),
8746                                      1, vectype, NULL);
8747             }
8748         }
8749       /* ???  Enable for loop costing as well.  */
8750       if (!loop_vinfo)
8751         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8752                           0, vect_epilogue);
8753       return true;
8754     }
8755
8756   /* Use the lhs of the original scalar statement.  */
8757   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8758   if (dump_enabled_p ())
8759     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8760                      "stmt %G", stmt);
8761
8762   lhs = gimple_get_lhs (stmt);
8763   lhs_type = TREE_TYPE (lhs);
8764
8765   bitsize = vector_element_bits_tree (vectype);
8766
8767   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8768   tree vec_lhs, bitstart;
8769   gimple *vec_stmt;
8770   if (slp_node)
8771     {
8772       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8773
8774       /* Get the correct slp vectorized stmt.  */
8775       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8776       vec_lhs = gimple_get_lhs (vec_stmt);
8777
8778       /* Get entry to use.  */
8779       bitstart = bitsize_int (vec_index);
8780       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8781     }
8782   else
8783     {
8784       /* For multiple copies, get the last copy.  */
8785       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8786       vec_lhs = gimple_get_lhs (vec_stmt);
8787
8788       /* Get the last lane in the vector.  */
8789       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8790     }
8791
8792   if (loop_vinfo)
8793     {
8794       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8795          requirement, insert one phi node for it.  It looks like:
8796            loop;
8797          BB:
8798            # lhs' = PHI <lhs>
8799          ==>
8800            loop;
8801          BB:
8802            # vec_lhs' = PHI <vec_lhs>
8803            new_tree = lane_extract <vec_lhs', ...>;
8804            lhs' = new_tree;  */
8805
8806       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8807       basic_block exit_bb = single_exit (loop)->dest;
8808       gcc_assert (single_pred_p (exit_bb));
8809
8810       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8811       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8812       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8813
8814       gimple_seq stmts = NULL;
8815       tree new_tree;
8816       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8817         {
8818           /* Emit:
8819
8820                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8821
8822              where VEC_LHS is the vectorized live-out result and MASK is
8823              the loop mask for the final iteration.  */
8824           gcc_assert (ncopies == 1 && !slp_node);
8825           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8826           tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8827                                           1, vectype, 0);
8828           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8829                                           mask, vec_lhs_phi);
8830
8831           /* Convert the extracted vector element to the scalar type.  */
8832           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8833         }
8834       else
8835         {
8836           tree bftype = TREE_TYPE (vectype);
8837           if (VECTOR_BOOLEAN_TYPE_P (vectype))
8838             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8839           new_tree = build3 (BIT_FIELD_REF, bftype,
8840                              vec_lhs_phi, bitsize, bitstart);
8841           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8842                                            &stmts, true, NULL_TREE);
8843         }
8844
8845       if (stmts)
8846         {
8847           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8848           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8849
8850           /* Remove existing phi from lhs and create one copy from new_tree.  */
8851           tree lhs_phi = NULL_TREE;
8852           gimple_stmt_iterator gsi;
8853           for (gsi = gsi_start_phis (exit_bb);
8854                !gsi_end_p (gsi); gsi_next (&gsi))
8855             {
8856               gimple *phi = gsi_stmt (gsi);
8857               if ((gimple_phi_arg_def (phi, 0) == lhs))
8858                 {
8859                   remove_phi_node (&gsi, false);
8860                   lhs_phi = gimple_phi_result (phi);
8861                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8862                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8863                   break;
8864                 }
8865             }
8866         }
8867
8868       /* Replace use of lhs with newly computed result.  If the use stmt is a
8869          single arg PHI, just replace all uses of PHI result.  It's necessary
8870          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8871       use_operand_p use_p;
8872       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8873         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8874             && !is_gimple_debug (use_stmt))
8875           {
8876             if (gimple_code (use_stmt) == GIMPLE_PHI
8877                 && gimple_phi_num_args (use_stmt) == 1)
8878               {
8879                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8880               }
8881             else
8882               {
8883                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8884                     SET_USE (use_p, new_tree);
8885               }
8886             update_stmt (use_stmt);
8887           }
8888     }
8889   else
8890     {
8891       /* For basic-block vectorization simply insert the lane-extraction.  */
8892       tree bftype = TREE_TYPE (vectype);
8893       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8894         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8895       tree new_tree = build3 (BIT_FIELD_REF, bftype,
8896                               vec_lhs, bitsize, bitstart);
8897       gimple_seq stmts = NULL;
8898       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8899                                        &stmts, true, NULL_TREE);
8900       if (TREE_CODE (new_tree) == SSA_NAME
8901           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8902         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8903       if (is_a <gphi *> (vec_stmt))
8904         {
8905           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8906           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8907         }
8908       else
8909         {
8910           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8911           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8912         }
8913
8914       /* Replace use of lhs with newly computed result.  If the use stmt is a
8915          single arg PHI, just replace all uses of PHI result.  It's necessary
8916          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8917       use_operand_p use_p;
8918       stmt_vec_info use_stmt_info;
8919       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8920         if (!is_gimple_debug (use_stmt)
8921             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8922                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8923           {
8924             /* ???  This can happen when the live lane ends up being
8925                used in a vector construction code-generated by an
8926                external SLP node (and code-generation for that already
8927                happened).  See gcc.dg/vect/bb-slp-47.c.
8928                Doing this is what would happen if that vector CTOR
8929                were not code-generated yet so it is not too bad.
8930                ???  In fact we'd likely want to avoid this situation
8931                in the first place.  */
8932             if (TREE_CODE (new_tree) == SSA_NAME
8933                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8934                 && gimple_code (use_stmt) != GIMPLE_PHI
8935                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8936                                                 use_stmt))
8937               {
8938                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8939                 gcc_assert (code == CONSTRUCTOR
8940                             || code == VIEW_CONVERT_EXPR
8941                             || CONVERT_EXPR_CODE_P (code));
8942                 if (dump_enabled_p ())
8943                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8944                                    "Using original scalar computation for "
8945                                    "live lane because use preceeds vector "
8946                                    "def\n");
8947                 continue;
8948               }
8949             /* ???  It can also happen that we end up pulling a def into
8950                a loop where replacing out-of-loop uses would require
8951                a new LC SSA PHI node.  Retain the original scalar in
8952                those cases as well.  PR98064.  */
8953             if (TREE_CODE (new_tree) == SSA_NAME
8954                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8955                 && (gimple_bb (use_stmt)->loop_father
8956                     != gimple_bb (vec_stmt)->loop_father)
8957                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8958                                         gimple_bb (use_stmt)->loop_father))
8959               {
8960                 if (dump_enabled_p ())
8961                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8962                                    "Using original scalar computation for "
8963                                    "live lane because there is an out-of-loop "
8964                                    "definition for it\n");
8965                 continue;
8966               }
8967             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8968               SET_USE (use_p, new_tree);
8969             update_stmt (use_stmt);
8970           }
8971     }
8972
8973   return true;
8974 }
8975
8976 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8977
8978 static void
8979 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8980 {
8981   ssa_op_iter op_iter;
8982   imm_use_iterator imm_iter;
8983   def_operand_p def_p;
8984   gimple *ustmt;
8985
8986   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8987     {
8988       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8989         {
8990           basic_block bb;
8991
8992           if (!is_gimple_debug (ustmt))
8993             continue;
8994
8995           bb = gimple_bb (ustmt);
8996
8997           if (!flow_bb_inside_loop_p (loop, bb))
8998             {
8999               if (gimple_debug_bind_p (ustmt))
9000                 {
9001                   if (dump_enabled_p ())
9002                     dump_printf_loc (MSG_NOTE, vect_location,
9003                                      "killing debug use\n");
9004
9005                   gimple_debug_bind_reset_value (ustmt);
9006                   update_stmt (ustmt);
9007                 }
9008               else
9009                 gcc_unreachable ();
9010             }
9011         }
9012     }
9013 }
9014
9015 /* Given loop represented by LOOP_VINFO, return true if computation of
9016    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9017    otherwise.  */
9018
9019 static bool
9020 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9021 {
9022   /* Constant case.  */
9023   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9024     {
9025       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9026       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9027
9028       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9029       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9030       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9031         return true;
9032     }
9033
9034   widest_int max;
9035   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9036   /* Check the upper bound of loop niters.  */
9037   if (get_max_loop_iterations (loop, &max))
9038     {
9039       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9040       signop sgn = TYPE_SIGN (type);
9041       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9042       if (max < type_max)
9043         return true;
9044     }
9045   return false;
9046 }
9047
9048 /* Return a mask type with half the number of elements as OLD_TYPE,
9049    given that it should have mode NEW_MODE.  */
9050
9051 tree
9052 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9053 {
9054   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9055   return build_truth_vector_type_for_mode (nunits, new_mode);
9056 }
9057
9058 /* Return a mask type with twice as many elements as OLD_TYPE,
9059    given that it should have mode NEW_MODE.  */
9060
9061 tree
9062 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9063 {
9064   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9065   return build_truth_vector_type_for_mode (nunits, new_mode);
9066 }
9067
9068 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9069    contain a sequence of NVECTORS masks that each control a vector of type
9070    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
9071    these vector masks with the vector version of SCALAR_MASK.  */
9072
9073 void
9074 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9075                        unsigned int nvectors, tree vectype, tree scalar_mask)
9076 {
9077   gcc_assert (nvectors != 0);
9078   if (masks->length () < nvectors)
9079     masks->safe_grow_cleared (nvectors, true);
9080   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9081   /* The number of scalars per iteration and the number of vectors are
9082      both compile-time constants.  */
9083   unsigned int nscalars_per_iter
9084     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9085                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9086
9087   if (scalar_mask)
9088     {
9089       scalar_cond_masked_key cond (scalar_mask, nvectors);
9090       loop_vinfo->scalar_cond_masked_set.add (cond);
9091     }
9092
9093   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9094     {
9095       rgm->max_nscalars_per_iter = nscalars_per_iter;
9096       rgm->type = truth_type_for (vectype);
9097       rgm->factor = 1;
9098     }
9099 }
9100
9101 /* Given a complete set of masks MASKS, extract mask number INDEX
9102    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9103    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
9104
9105    See the comment above vec_loop_masks for more details about the mask
9106    arrangement.  */
9107
9108 tree
9109 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9110                     unsigned int nvectors, tree vectype, unsigned int index)
9111 {
9112   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9113   tree mask_type = rgm->type;
9114
9115   /* Populate the rgroup's mask array, if this is the first time we've
9116      used it.  */
9117   if (rgm->controls.is_empty ())
9118     {
9119       rgm->controls.safe_grow_cleared (nvectors, true);
9120       for (unsigned int i = 0; i < nvectors; ++i)
9121         {
9122           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9123           /* Provide a dummy definition until the real one is available.  */
9124           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9125           rgm->controls[i] = mask;
9126         }
9127     }
9128
9129   tree mask = rgm->controls[index];
9130   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9131                 TYPE_VECTOR_SUBPARTS (vectype)))
9132     {
9133       /* A loop mask for data type X can be reused for data type Y
9134          if X has N times more elements than Y and if Y's elements
9135          are N times bigger than X's.  In this case each sequence
9136          of N elements in the loop mask will be all-zero or all-one.
9137          We can then view-convert the mask so that each sequence of
9138          N elements is replaced by a single element.  */
9139       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9140                               TYPE_VECTOR_SUBPARTS (vectype)));
9141       gimple_seq seq = NULL;
9142       mask_type = truth_type_for (vectype);
9143       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9144       if (seq)
9145         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9146     }
9147   return mask;
9148 }
9149
9150 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9151    lengths for controlling an operation on VECTYPE.  The operation splits
9152    each element of VECTYPE into FACTOR separate subelements, measuring the
9153    length as a number of these subelements.  */
9154
9155 void
9156 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9157                       unsigned int nvectors, tree vectype, unsigned int factor)
9158 {
9159   gcc_assert (nvectors != 0);
9160   if (lens->length () < nvectors)
9161     lens->safe_grow_cleared (nvectors, true);
9162   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9163
9164   /* The number of scalars per iteration, scalar occupied bytes and
9165      the number of vectors are both compile-time constants.  */
9166   unsigned int nscalars_per_iter
9167     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9168                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9169
9170   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9171     {
9172       /* For now, we only support cases in which all loads and stores fall back
9173          to VnQI or none do.  */
9174       gcc_assert (!rgl->max_nscalars_per_iter
9175                   || (rgl->factor == 1 && factor == 1)
9176                   || (rgl->max_nscalars_per_iter * rgl->factor
9177                       == nscalars_per_iter * factor));
9178       rgl->max_nscalars_per_iter = nscalars_per_iter;
9179       rgl->type = vectype;
9180       rgl->factor = factor;
9181     }
9182 }
9183
9184 /* Given a complete set of length LENS, extract length number INDEX for an
9185    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9186
9187 tree
9188 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9189                    unsigned int nvectors, unsigned int index)
9190 {
9191   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9192
9193   /* Populate the rgroup's len array, if this is the first time we've
9194      used it.  */
9195   if (rgl->controls.is_empty ())
9196     {
9197       rgl->controls.safe_grow_cleared (nvectors, true);
9198       for (unsigned int i = 0; i < nvectors; ++i)
9199         {
9200           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9201           gcc_assert (len_type != NULL_TREE);
9202           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9203
9204           /* Provide a dummy definition until the real one is available.  */
9205           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9206           rgl->controls[i] = len;
9207         }
9208     }
9209
9210   return rgl->controls[index];
9211 }
9212
9213 /* Scale profiling counters by estimation for LOOP which is vectorized
9214    by factor VF.  */
9215
9216 static void
9217 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9218 {
9219   edge preheader = loop_preheader_edge (loop);
9220   /* Reduce loop iterations by the vectorization factor.  */
9221   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9222   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9223
9224   if (freq_h.nonzero_p ())
9225     {
9226       profile_probability p;
9227
9228       /* Avoid dropping loop body profile counter to 0 because of zero count
9229          in loop's preheader.  */
9230       if (!(freq_e == profile_count::zero ()))
9231         freq_e = freq_e.force_nonzero ();
9232       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9233       scale_loop_frequencies (loop, p);
9234     }
9235
9236   edge exit_e = single_exit (loop);
9237   exit_e->probability = profile_probability::always ()
9238                                  .apply_scale (1, new_est_niter + 1);
9239
9240   edge exit_l = single_pred_edge (loop->latch);
9241   profile_probability prob = exit_l->probability;
9242   exit_l->probability = exit_e->probability.invert ();
9243   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9244     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9245 }
9246
9247 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9248    latch edge values originally defined by it.  */
9249
9250 static void
9251 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9252                                      stmt_vec_info def_stmt_info)
9253 {
9254   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9255   if (!def || TREE_CODE (def) != SSA_NAME)
9256     return;
9257   stmt_vec_info phi_info;
9258   imm_use_iterator iter;
9259   use_operand_p use_p;
9260   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9261     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9262       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9263           && (phi_info = loop_vinfo->lookup_stmt (phi))
9264           && STMT_VINFO_RELEVANT_P (phi_info)
9265           && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9266           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9267           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9268         {
9269           loop_p loop = gimple_bb (phi)->loop_father;
9270           edge e = loop_latch_edge (loop);
9271           if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9272             {
9273               vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9274               vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9275               gcc_assert (phi_defs.length () == latch_defs.length ());
9276               for (unsigned i = 0; i < phi_defs.length (); ++i)
9277                 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9278                              gimple_get_lhs (latch_defs[i]), e,
9279                              gimple_phi_arg_location (phi, e->dest_idx));
9280             }
9281         }
9282 }
9283
9284 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9285    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9286    stmt_vec_info.  */
9287
9288 static bool
9289 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9290                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9291 {
9292   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9293   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9294
9295   if (dump_enabled_p ())
9296     dump_printf_loc (MSG_NOTE, vect_location,
9297                      "------>vectorizing statement: %G", stmt_info->stmt);
9298
9299   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9300     vect_loop_kill_debug_uses (loop, stmt_info);
9301
9302   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9303       && !STMT_VINFO_LIVE_P (stmt_info))
9304     return false;
9305
9306   if (STMT_VINFO_VECTYPE (stmt_info))
9307     {
9308       poly_uint64 nunits
9309         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9310       if (!STMT_SLP_TYPE (stmt_info)
9311           && maybe_ne (nunits, vf)
9312           && dump_enabled_p ())
9313         /* For SLP VF is set according to unrolling factor, and not
9314            to vector size, hence for SLP this print is not valid.  */
9315         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9316     }
9317
9318   /* Pure SLP statements have already been vectorized.  We still need
9319      to apply loop vectorization to hybrid SLP statements.  */
9320   if (PURE_SLP_STMT (stmt_info))
9321     return false;
9322
9323   if (dump_enabled_p ())
9324     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9325
9326   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9327     *seen_store = stmt_info;
9328
9329   return true;
9330 }
9331
9332 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9333    in the hash_map with its corresponding values.  */
9334
9335 static tree
9336 find_in_mapping (tree t, void *context)
9337 {
9338   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9339
9340   tree *value = mapping->get (t);
9341   return value ? *value : t;
9342 }
9343
9344 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9345    original loop that has now been vectorized.
9346
9347    The inits of the data_references need to be advanced with the number of
9348    iterations of the main loop.  This has been computed in vect_do_peeling and
9349    is stored in parameter ADVANCE.  We first restore the data_references
9350    initial offset with the values recored in ORIG_DRS_INIT.
9351
9352    Since the loop_vec_info of this EPILOGUE was constructed for the original
9353    loop, its stmt_vec_infos all point to the original statements.  These need
9354    to be updated to point to their corresponding copies as well as the SSA_NAMES
9355    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9356
9357    The data_reference's connections also need to be updated.  Their
9358    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9359    stmt_vec_infos, their statements need to point to their corresponding copy,
9360    if they are gather loads or scatter stores then their reference needs to be
9361    updated to point to its corresponding copy and finally we set
9362    'base_misaligned' to false as we have already peeled for alignment in the
9363    prologue of the main loop.  */
9364
9365 static void
9366 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9367 {
9368   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9369   auto_vec<gimple *> stmt_worklist;
9370   hash_map<tree,tree> mapping;
9371   gimple *orig_stmt, *new_stmt;
9372   gimple_stmt_iterator epilogue_gsi;
9373   gphi_iterator epilogue_phi_gsi;
9374   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9375   basic_block *epilogue_bbs = get_loop_body (epilogue);
9376   unsigned i;
9377
9378   free (LOOP_VINFO_BBS (epilogue_vinfo));
9379   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9380
9381   /* Advance data_reference's with the number of iterations of the previous
9382      loop and its prologue.  */
9383   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9384
9385
9386   /* The EPILOGUE loop is a copy of the original loop so they share the same
9387      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9388      point to the copied statements.  We also create a mapping of all LHS' in
9389      the original loop and all the LHS' in the EPILOGUE and create worklists to
9390      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9391   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9392     {
9393       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9394            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9395         {
9396           new_stmt = epilogue_phi_gsi.phi ();
9397
9398           gcc_assert (gimple_uid (new_stmt) > 0);
9399           stmt_vinfo
9400             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9401
9402           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9403           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9404
9405           mapping.put (gimple_phi_result (orig_stmt),
9406                        gimple_phi_result (new_stmt));
9407           /* PHI nodes can not have patterns or related statements.  */
9408           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9409                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9410         }
9411
9412       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9413            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9414         {
9415           new_stmt = gsi_stmt (epilogue_gsi);
9416           if (is_gimple_debug (new_stmt))
9417             continue;
9418
9419           gcc_assert (gimple_uid (new_stmt) > 0);
9420           stmt_vinfo
9421             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9422
9423           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9424           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9425
9426           if (tree old_lhs = gimple_get_lhs (orig_stmt))
9427             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9428
9429           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9430             {
9431               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9432               for (gimple_stmt_iterator gsi = gsi_start (seq);
9433                    !gsi_end_p (gsi); gsi_next (&gsi))
9434                 stmt_worklist.safe_push (gsi_stmt (gsi));
9435             }
9436
9437           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9438           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9439             {
9440               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9441               stmt_worklist.safe_push (stmt);
9442               /* Set BB such that the assert in
9443                 'get_initial_def_for_reduction' is able to determine that
9444                 the BB of the related stmt is inside this loop.  */
9445               gimple_set_bb (stmt,
9446                              gimple_bb (new_stmt));
9447               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9448               gcc_assert (related_vinfo == NULL
9449                           || related_vinfo == stmt_vinfo);
9450             }
9451         }
9452     }
9453
9454   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9455      using the original main loop and thus need to be updated to refer to the
9456      cloned variables used in the epilogue.  */
9457   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9458     {
9459       gimple *stmt = stmt_worklist[i];
9460       tree *new_op;
9461
9462       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9463         {
9464           tree op = gimple_op (stmt, j);
9465           if ((new_op = mapping.get(op)))
9466             gimple_set_op (stmt, j, *new_op);
9467           else
9468             {
9469               /* PR92429: The last argument of simplify_replace_tree disables
9470                  folding when replacing arguments.  This is required as
9471                  otherwise you might end up with different statements than the
9472                  ones analyzed in vect_loop_analyze, leading to different
9473                  vectorization.  */
9474               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9475                                           &find_in_mapping, &mapping, false);
9476               gimple_set_op (stmt, j, op);
9477             }
9478         }
9479     }
9480
9481   struct data_reference *dr;
9482   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9483   FOR_EACH_VEC_ELT (datarefs, i, dr)
9484     {
9485       orig_stmt = DR_STMT (dr);
9486       gcc_assert (gimple_uid (orig_stmt) > 0);
9487       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9488       /* Data references for gather loads and scatter stores do not use the
9489          updated offset we set using ADVANCE.  Instead we have to make sure the
9490          reference in the data references point to the corresponding copy of
9491          the original in the epilogue.  */
9492       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9493           == VMAT_GATHER_SCATTER)
9494         {
9495           DR_REF (dr)
9496             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9497                                      &find_in_mapping, &mapping);
9498           DR_BASE_ADDRESS (dr)
9499             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9500                                      &find_in_mapping, &mapping);
9501         }
9502       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9503       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9504       /* The vector size of the epilogue is smaller than that of the main loop
9505          so the alignment is either the same or lower. This means the dr will
9506          thus by definition be aligned.  */
9507       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9508     }
9509
9510   epilogue_vinfo->shared->datarefs_copy.release ();
9511   epilogue_vinfo->shared->save_datarefs ();
9512 }
9513
9514 /* Function vect_transform_loop.
9515
9516    The analysis phase has determined that the loop is vectorizable.
9517    Vectorize the loop - created vectorized stmts to replace the scalar
9518    stmts in the loop, and update the loop exit condition.
9519    Returns scalar epilogue loop if any.  */
9520
9521 class loop *
9522 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9523 {
9524   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9525   class loop *epilogue = NULL;
9526   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9527   int nbbs = loop->num_nodes;
9528   int i;
9529   tree niters_vector = NULL_TREE;
9530   tree step_vector = NULL_TREE;
9531   tree niters_vector_mult_vf = NULL_TREE;
9532   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9533   unsigned int lowest_vf = constant_lower_bound (vf);
9534   gimple *stmt;
9535   bool check_profitability = false;
9536   unsigned int th;
9537
9538   DUMP_VECT_SCOPE ("vec_transform_loop");
9539
9540   loop_vinfo->shared->check_datarefs ();
9541
9542   /* Use the more conservative vectorization threshold.  If the number
9543      of iterations is constant assume the cost check has been performed
9544      by our caller.  If the threshold makes all loops profitable that
9545      run at least the (estimated) vectorization factor number of times
9546      checking is pointless, too.  */
9547   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9548   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9549     {
9550       if (dump_enabled_p ())
9551         dump_printf_loc (MSG_NOTE, vect_location,
9552                          "Profitability threshold is %d loop iterations.\n",
9553                          th);
9554       check_profitability = true;
9555     }
9556
9557   /* Make sure there exists a single-predecessor exit bb.  Do this before
9558      versioning.   */
9559   edge e = single_exit (loop);
9560   if (! single_pred_p (e->dest))
9561     {
9562       split_loop_exit_edge (e, true);
9563       if (dump_enabled_p ())
9564         dump_printf (MSG_NOTE, "split exit edge\n");
9565     }
9566
9567   /* Version the loop first, if required, so the profitability check
9568      comes first.  */
9569
9570   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9571     {
9572       class loop *sloop
9573         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9574       sloop->force_vectorize = false;
9575       check_profitability = false;
9576     }
9577
9578   /* Make sure there exists a single-predecessor exit bb also on the
9579      scalar loop copy.  Do this after versioning but before peeling
9580      so CFG structure is fine for both scalar and if-converted loop
9581      to make slpeel_duplicate_current_defs_from_edges face matched
9582      loop closed PHI nodes on the exit.  */
9583   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9584     {
9585       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9586       if (! single_pred_p (e->dest))
9587         {
9588           split_loop_exit_edge (e, true);
9589           if (dump_enabled_p ())
9590             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9591         }
9592     }
9593
9594   tree niters = vect_build_loop_niters (loop_vinfo);
9595   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9596   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9597   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9598   tree advance;
9599   drs_init_vec orig_drs_init;
9600
9601   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9602                               &step_vector, &niters_vector_mult_vf, th,
9603                               check_profitability, niters_no_overflow,
9604                               &advance);
9605
9606   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9607       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9608     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9609                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9610
9611   if (niters_vector == NULL_TREE)
9612     {
9613       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9614           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9615           && known_eq (lowest_vf, vf))
9616         {
9617           niters_vector
9618             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9619                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9620           step_vector = build_one_cst (TREE_TYPE (niters));
9621         }
9622       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9623         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9624                                      &step_vector, niters_no_overflow);
9625       else
9626         /* vect_do_peeling subtracted the number of peeled prologue
9627            iterations from LOOP_VINFO_NITERS.  */
9628         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9629                                      &niters_vector, &step_vector,
9630                                      niters_no_overflow);
9631     }
9632
9633   /* 1) Make sure the loop header has exactly two entries
9634      2) Make sure we have a preheader basic block.  */
9635
9636   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9637
9638   split_edge (loop_preheader_edge (loop));
9639
9640   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9641     /* This will deal with any possible peeling.  */
9642     vect_prepare_for_masked_peels (loop_vinfo);
9643
9644   /* Schedule the SLP instances first, then handle loop vectorization
9645      below.  */
9646   if (!loop_vinfo->slp_instances.is_empty ())
9647     {
9648       DUMP_VECT_SCOPE ("scheduling SLP instances");
9649       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9650     }
9651
9652   /* FORNOW: the vectorizer supports only loops which body consist
9653      of one basic block (header + empty latch). When the vectorizer will
9654      support more involved loop forms, the order by which the BBs are
9655      traversed need to be reconsidered.  */
9656
9657   for (i = 0; i < nbbs; i++)
9658     {
9659       basic_block bb = bbs[i];
9660       stmt_vec_info stmt_info;
9661
9662       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9663            gsi_next (&si))
9664         {
9665           gphi *phi = si.phi ();
9666           if (dump_enabled_p ())
9667             dump_printf_loc (MSG_NOTE, vect_location,
9668                              "------>vectorizing phi: %G", phi);
9669           stmt_info = loop_vinfo->lookup_stmt (phi);
9670           if (!stmt_info)
9671             continue;
9672
9673           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9674             vect_loop_kill_debug_uses (loop, stmt_info);
9675
9676           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9677               && !STMT_VINFO_LIVE_P (stmt_info))
9678             continue;
9679
9680           if (STMT_VINFO_VECTYPE (stmt_info)
9681               && (maybe_ne
9682                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9683               && dump_enabled_p ())
9684             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9685
9686           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9687                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9688                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9689                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9690                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9691               && ! PURE_SLP_STMT (stmt_info))
9692             {
9693               if (dump_enabled_p ())
9694                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9695               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9696             }
9697         }
9698
9699       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9700            gsi_next (&si))
9701         {
9702           gphi *phi = si.phi ();
9703           stmt_info = loop_vinfo->lookup_stmt (phi);
9704           if (!stmt_info)
9705             continue;
9706
9707           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9708               && !STMT_VINFO_LIVE_P (stmt_info))
9709             continue;
9710
9711           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9712                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9713                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9714                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9715                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9716               && ! PURE_SLP_STMT (stmt_info))
9717             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9718         }
9719
9720       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9721            !gsi_end_p (si);)
9722         {
9723           stmt = gsi_stmt (si);
9724           /* During vectorization remove existing clobber stmts.  */
9725           if (gimple_clobber_p (stmt))
9726             {
9727               unlink_stmt_vdef (stmt);
9728               gsi_remove (&si, true);
9729               release_defs (stmt);
9730             }
9731           else
9732             {
9733               /* Ignore vector stmts created in the outer loop.  */
9734               stmt_info = loop_vinfo->lookup_stmt (stmt);
9735
9736               /* vector stmts created in the outer-loop during vectorization of
9737                  stmts in an inner-loop may not have a stmt_info, and do not
9738                  need to be vectorized.  */
9739               stmt_vec_info seen_store = NULL;
9740               if (stmt_info)
9741                 {
9742                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9743                     {
9744                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9745                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9746                            !gsi_end_p (subsi); gsi_next (&subsi))
9747                         {
9748                           stmt_vec_info pat_stmt_info
9749                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9750                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9751                                                     &si, &seen_store);
9752                         }
9753                       stmt_vec_info pat_stmt_info
9754                         = STMT_VINFO_RELATED_STMT (stmt_info);
9755                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9756                                                     &si, &seen_store))
9757                         maybe_set_vectorized_backedge_value (loop_vinfo,
9758                                                              pat_stmt_info);
9759                     }
9760                   else
9761                     {
9762                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9763                                                     &seen_store))
9764                         maybe_set_vectorized_backedge_value (loop_vinfo,
9765                                                              stmt_info);
9766                     }
9767                 }
9768               gsi_next (&si);
9769               if (seen_store)
9770                 {
9771                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9772                     /* Interleaving.  If IS_STORE is TRUE, the
9773                        vectorization of the interleaving chain was
9774                        completed - free all the stores in the chain.  */
9775                     vect_remove_stores (loop_vinfo,
9776                                         DR_GROUP_FIRST_ELEMENT (seen_store));
9777                   else
9778                     /* Free the attached stmt_vec_info and remove the stmt.  */
9779                     loop_vinfo->remove_stmt (stmt_info);
9780                 }
9781             }
9782         }
9783
9784       /* Stub out scalar statements that must not survive vectorization.
9785          Doing this here helps with grouped statements, or statements that
9786          are involved in patterns.  */
9787       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9788            !gsi_end_p (gsi); gsi_next (&gsi))
9789         {
9790           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9791           if (!call || !gimple_call_internal_p (call))
9792             continue;
9793           internal_fn ifn = gimple_call_internal_fn (call);
9794           if (ifn == IFN_MASK_LOAD)
9795             {
9796               tree lhs = gimple_get_lhs (call);
9797               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9798                 {
9799                   tree zero = build_zero_cst (TREE_TYPE (lhs));
9800                   gimple *new_stmt = gimple_build_assign (lhs, zero);
9801                   gsi_replace (&gsi, new_stmt, true);
9802                 }
9803             }
9804           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9805             {
9806               tree lhs = gimple_get_lhs (call);
9807               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9808                 {
9809                   tree else_arg
9810                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9811                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9812                   gsi_replace (&gsi, new_stmt, true);
9813                 }
9814             }
9815         }
9816     }                           /* BBs in loop */
9817
9818   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9819      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9820   if (integer_onep (step_vector))
9821     niters_no_overflow = true;
9822   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9823                            niters_vector_mult_vf, !niters_no_overflow);
9824
9825   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9826   scale_profile_for_vect_loop (loop, assumed_vf);
9827
9828   /* True if the final iteration might not handle a full vector's
9829      worth of scalar iterations.  */
9830   bool final_iter_may_be_partial
9831     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9832   /* The minimum number of iterations performed by the epilogue.  This
9833      is 1 when peeling for gaps because we always need a final scalar
9834      iteration.  */
9835   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9836   /* +1 to convert latch counts to loop iteration counts,
9837      -min_epilogue_iters to remove iterations that cannot be performed
9838        by the vector code.  */
9839   int bias_for_lowest = 1 - min_epilogue_iters;
9840   int bias_for_assumed = bias_for_lowest;
9841   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9842   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9843     {
9844       /* When the amount of peeling is known at compile time, the first
9845          iteration will have exactly alignment_npeels active elements.
9846          In the worst case it will have at least one.  */
9847       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9848       bias_for_lowest += lowest_vf - min_first_active;
9849       bias_for_assumed += assumed_vf - min_first_active;
9850     }
9851   /* In these calculations the "- 1" converts loop iteration counts
9852      back to latch counts.  */
9853   if (loop->any_upper_bound)
9854     {
9855       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9856       loop->nb_iterations_upper_bound
9857         = (final_iter_may_be_partial
9858            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9859                             lowest_vf) - 1
9860            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9861                              lowest_vf) - 1);
9862       if (main_vinfo)
9863         {
9864           unsigned int bound;
9865           poly_uint64 main_iters
9866             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9867                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9868           main_iters
9869             = upper_bound (main_iters,
9870                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9871           if (can_div_away_from_zero_p (main_iters,
9872                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9873                                         &bound))
9874             loop->nb_iterations_upper_bound
9875               = wi::umin ((widest_int) (bound - 1),
9876                           loop->nb_iterations_upper_bound);
9877       }
9878   }
9879   if (loop->any_likely_upper_bound)
9880     loop->nb_iterations_likely_upper_bound
9881       = (final_iter_may_be_partial
9882          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9883                           + bias_for_lowest, lowest_vf) - 1
9884          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9885                            + bias_for_lowest, lowest_vf) - 1);
9886   if (loop->any_estimate)
9887     loop->nb_iterations_estimate
9888       = (final_iter_may_be_partial
9889          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9890                           assumed_vf) - 1
9891          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9892                            assumed_vf) - 1);
9893
9894   if (dump_enabled_p ())
9895     {
9896       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9897         {
9898           dump_printf_loc (MSG_NOTE, vect_location,
9899                            "LOOP VECTORIZED\n");
9900           if (loop->inner)
9901             dump_printf_loc (MSG_NOTE, vect_location,
9902                              "OUTER LOOP VECTORIZED\n");
9903           dump_printf (MSG_NOTE, "\n");
9904         }
9905       else
9906         dump_printf_loc (MSG_NOTE, vect_location,
9907                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9908                          GET_MODE_NAME (loop_vinfo->vector_mode));
9909     }
9910
9911   /* Loops vectorized with a variable factor won't benefit from
9912      unrolling/peeling.  */
9913   if (!vf.is_constant ())
9914     {
9915       loop->unroll = 1;
9916       if (dump_enabled_p ())
9917         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9918                          " variable-length vectorization factor\n");
9919     }
9920   /* Free SLP instances here because otherwise stmt reference counting
9921      won't work.  */
9922   slp_instance instance;
9923   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9924     vect_free_slp_instance (instance);
9925   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9926   /* Clear-up safelen field since its value is invalid after vectorization
9927      since vectorized loop can have loop-carried dependencies.  */
9928   loop->safelen = 0;
9929
9930   if (epilogue)
9931     {
9932       update_epilogue_loop_vinfo (epilogue, advance);
9933
9934       epilogue->simduid = loop->simduid;
9935       epilogue->force_vectorize = loop->force_vectorize;
9936       epilogue->dont_vectorize = false;
9937     }
9938
9939   return epilogue;
9940 }
9941
9942 /* The code below is trying to perform simple optimization - revert
9943    if-conversion for masked stores, i.e. if the mask of a store is zero
9944    do not perform it and all stored value producers also if possible.
9945    For example,
9946      for (i=0; i<n; i++)
9947        if (c[i])
9948         {
9949           p1[i] += 1;
9950           p2[i] = p3[i] +2;
9951         }
9952    this transformation will produce the following semi-hammock:
9953
9954    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9955      {
9956        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9957        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9958        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9959        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9960        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9961        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9962      }
9963 */
9964
9965 void
9966 optimize_mask_stores (class loop *loop)
9967 {
9968   basic_block *bbs = get_loop_body (loop);
9969   unsigned nbbs = loop->num_nodes;
9970   unsigned i;
9971   basic_block bb;
9972   class loop *bb_loop;
9973   gimple_stmt_iterator gsi;
9974   gimple *stmt;
9975   auto_vec<gimple *> worklist;
9976   auto_purge_vect_location sentinel;
9977
9978   vect_location = find_loop_location (loop);
9979   /* Pick up all masked stores in loop if any.  */
9980   for (i = 0; i < nbbs; i++)
9981     {
9982       bb = bbs[i];
9983       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9984            gsi_next (&gsi))
9985         {
9986           stmt = gsi_stmt (gsi);
9987           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9988             worklist.safe_push (stmt);
9989         }
9990     }
9991
9992   free (bbs);
9993   if (worklist.is_empty ())
9994     return;
9995
9996   /* Loop has masked stores.  */
9997   while (!worklist.is_empty ())
9998     {
9999       gimple *last, *last_store;
10000       edge e, efalse;
10001       tree mask;
10002       basic_block store_bb, join_bb;
10003       gimple_stmt_iterator gsi_to;
10004       tree vdef, new_vdef;
10005       gphi *phi;
10006       tree vectype;
10007       tree zero;
10008
10009       last = worklist.pop ();
10010       mask = gimple_call_arg (last, 2);
10011       bb = gimple_bb (last);
10012       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10013          the same loop as if_bb.  It could be different to LOOP when two
10014          level loop-nest is vectorized and mask_store belongs to the inner
10015          one.  */
10016       e = split_block (bb, last);
10017       bb_loop = bb->loop_father;
10018       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10019       join_bb = e->dest;
10020       store_bb = create_empty_bb (bb);
10021       add_bb_to_loop (store_bb, bb_loop);
10022       e->flags = EDGE_TRUE_VALUE;
10023       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10024       /* Put STORE_BB to likely part.  */
10025       efalse->probability = profile_probability::unlikely ();
10026       store_bb->count = efalse->count ();
10027       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10028       if (dom_info_available_p (CDI_DOMINATORS))
10029         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10030       if (dump_enabled_p ())
10031         dump_printf_loc (MSG_NOTE, vect_location,
10032                          "Create new block %d to sink mask stores.",
10033                          store_bb->index);
10034       /* Create vector comparison with boolean result.  */
10035       vectype = TREE_TYPE (mask);
10036       zero = build_zero_cst (vectype);
10037       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10038       gsi = gsi_last_bb (bb);
10039       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10040       /* Create new PHI node for vdef of the last masked store:
10041          .MEM_2 = VDEF <.MEM_1>
10042          will be converted to
10043          .MEM.3 = VDEF <.MEM_1>
10044          and new PHI node will be created in join bb
10045          .MEM_2 = PHI <.MEM_1, .MEM_3>
10046       */
10047       vdef = gimple_vdef (last);
10048       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10049       gimple_set_vdef (last, new_vdef);
10050       phi = create_phi_node (vdef, join_bb);
10051       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10052
10053       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10054       while (true)
10055         {
10056           gimple_stmt_iterator gsi_from;
10057           gimple *stmt1 = NULL;
10058
10059           /* Move masked store to STORE_BB.  */
10060           last_store = last;
10061           gsi = gsi_for_stmt (last);
10062           gsi_from = gsi;
10063           /* Shift GSI to the previous stmt for further traversal.  */
10064           gsi_prev (&gsi);
10065           gsi_to = gsi_start_bb (store_bb);
10066           gsi_move_before (&gsi_from, &gsi_to);
10067           /* Setup GSI_TO to the non-empty block start.  */
10068           gsi_to = gsi_start_bb (store_bb);
10069           if (dump_enabled_p ())
10070             dump_printf_loc (MSG_NOTE, vect_location,
10071                              "Move stmt to created bb\n%G", last);
10072           /* Move all stored value producers if possible.  */
10073           while (!gsi_end_p (gsi))
10074             {
10075               tree lhs;
10076               imm_use_iterator imm_iter;
10077               use_operand_p use_p;
10078               bool res;
10079
10080               /* Skip debug statements.  */
10081               if (is_gimple_debug (gsi_stmt (gsi)))
10082                 {
10083                   gsi_prev (&gsi);
10084                   continue;
10085                 }
10086               stmt1 = gsi_stmt (gsi);
10087               /* Do not consider statements writing to memory or having
10088                  volatile operand.  */
10089               if (gimple_vdef (stmt1)
10090                   || gimple_has_volatile_ops (stmt1))
10091                 break;
10092               gsi_from = gsi;
10093               gsi_prev (&gsi);
10094               lhs = gimple_get_lhs (stmt1);
10095               if (!lhs)
10096                 break;
10097
10098               /* LHS of vectorized stmt must be SSA_NAME.  */
10099               if (TREE_CODE (lhs) != SSA_NAME)
10100                 break;
10101
10102               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10103                 {
10104                   /* Remove dead scalar statement.  */
10105                   if (has_zero_uses (lhs))
10106                     {
10107                       gsi_remove (&gsi_from, true);
10108                       continue;
10109                     }
10110                 }
10111
10112               /* Check that LHS does not have uses outside of STORE_BB.  */
10113               res = true;
10114               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10115                 {
10116                   gimple *use_stmt;
10117                   use_stmt = USE_STMT (use_p);
10118                   if (is_gimple_debug (use_stmt))
10119                     continue;
10120                   if (gimple_bb (use_stmt) != store_bb)
10121                     {
10122                       res = false;
10123                       break;
10124                     }
10125                 }
10126               if (!res)
10127                 break;
10128
10129               if (gimple_vuse (stmt1)
10130                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
10131                 break;
10132
10133               /* Can move STMT1 to STORE_BB.  */
10134               if (dump_enabled_p ())
10135                 dump_printf_loc (MSG_NOTE, vect_location,
10136                                  "Move stmt to created bb\n%G", stmt1);
10137               gsi_move_before (&gsi_from, &gsi_to);
10138               /* Shift GSI_TO for further insertion.  */
10139               gsi_prev (&gsi_to);
10140             }
10141           /* Put other masked stores with the same mask to STORE_BB.  */
10142           if (worklist.is_empty ()
10143               || gimple_call_arg (worklist.last (), 2) != mask
10144               || worklist.last () != stmt1)
10145             break;
10146           last = worklist.pop ();
10147         }
10148       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10149     }
10150 }
10151
10152 /* Decide whether it is possible to use a zero-based induction variable
10153    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10154    the value that the induction variable must be able to hold in order
10155    to ensure that the rgroups eventually have no active vector elements.
10156    Return -1 otherwise.  */
10157
10158 widest_int
10159 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10160 {
10161   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10162   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10163   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10164
10165   /* Calculate the value that the induction variable must be able
10166      to hit in order to ensure that we end the loop with an all-false mask.
10167      This involves adding the maximum number of inactive trailing scalar
10168      iterations.  */
10169   widest_int iv_limit = -1;
10170   if (max_loop_iterations (loop, &iv_limit))
10171     {
10172       if (niters_skip)
10173         {
10174           /* Add the maximum number of skipped iterations to the
10175              maximum iteration count.  */
10176           if (TREE_CODE (niters_skip) == INTEGER_CST)
10177             iv_limit += wi::to_widest (niters_skip);
10178           else
10179             iv_limit += max_vf - 1;
10180         }
10181       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10182         /* Make a conservatively-correct assumption.  */
10183         iv_limit += max_vf - 1;
10184
10185       /* IV_LIMIT is the maximum number of latch iterations, which is also
10186          the maximum in-range IV value.  Round this value down to the previous
10187          vector alignment boundary and then add an extra full iteration.  */
10188       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10189       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10190     }
10191   return iv_limit;
10192 }
10193
10194 /* For the given rgroup_controls RGC, check whether an induction variable
10195    would ever hit a value that produces a set of all-false masks or zero
10196    lengths before wrapping around.  Return true if it's possible to wrap
10197    around before hitting the desirable value, otherwise return false.  */
10198
10199 bool
10200 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10201 {
10202   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10203
10204   if (iv_limit == -1)
10205     return true;
10206
10207   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10208   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10209   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10210
10211   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10212     return true;
10213
10214   return false;
10215 }