gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57 #include "case-cfn-macros.h"
  58
  59 /* Loop Vectorization Pass.
  60
  61    This pass tries to vectorize loops.
  62
  63    For example, the vectorizer transforms the following simple loop:
  64
  65         short a[N]; short b[N]; short c[N]; int i;
  66
  67         for (i=0; i<N; i++){
  68           a[i] = b[i] + c[i];
  69         }
  70
  71    as if it was manually vectorized by rewriting the source code into:
  72
  73         typedef int __attribute__((mode(V8HI))) v8hi;
  74         short a[N];  short b[N]; short c[N];   int i;
  75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  76         v8hi va, vb, vc;
  77
  78         for (i=0; i<N/8; i++){
  79           vb = pb[i];
  80           vc = pc[i];
  81           va = vb + vc;
  82           pa[i] = va;
  83         }
  84
  85         The main entry to this pass is vectorize_loops(), in which
  86    the vectorizer applies a set of analyses on a given set of loops,
  87    followed by the actual vectorization transformation for the loops that
  88    had successfully passed the analysis phase.
  89         Throughout this pass we make a distinction between two types of
  90    data: scalars (which are represented by SSA_NAMES), and memory references
  91    ("data-refs").  These two types of data require different handling both
  92    during analysis and transformation. The types of data-refs that the
  93    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  95    accesses are required to have a simple (consecutive) access pattern.
  96
  97    Analysis phase:
  98    ===============
  99         The driver for the analysis phase is vect_analyze_loop().
 100    It applies a set of analyses, some of which rely on the scalar evolution
 101    analyzer (scev) developed by Sebastian Pop.
 102
 103         During the analysis phase the vectorizer records some information
 104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 105    loop, as well as general information about the loop as a whole, which is
 106    recorded in a "loop_vec_info" struct attached to each loop.
 107
 108    Transformation phase:
 109    =====================
 110         The loop transformation phase scans all the stmts in the loop, and
 111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 112    the loop that needs to be vectorized.  It inserts the vector code sequence
 113    just before the scalar stmt S, and records a pointer to the vector code
 114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 115    attached to S).  This pointer will be used for the vectorization of following
 116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 117    otherwise, we rely on dead code elimination for removing it.
 118
 119         For example, say stmt S1 was vectorized into stmt VS1:
 120
 121    VS1: vb = px[i];
 122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 123    S2:  a = b;
 124
 125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 128    resulting sequence would be:
 129
 130    VS1: vb = px[i];
 131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 132    VS2: va = vb;
 133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134
 135         Operands that are not SSA_NAMEs, are data-refs that appear in
 136    load/store operations (like 'x[i]' in S1), and are handled differently.
 137
 138    Target modeling:
 139    =================
 140         Currently the only target specific information that is used is the
 141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 142    Targets that can support different sizes of vectors, for now will need
 143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 144    flexibility will be added in the future.
 145
 146         Since we only vectorize operations which vector form can be
 147    expressed using existing tree codes, to verify that an operation is
 148    supported, the vectorizer checks the relevant optab at the relevant
 149    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 150    the value found is CODE_FOR_nothing, then there's no target support, and
 151    we can't vectorize the stmt.
 152
 153    For additional information on this project see:
 154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 155 */
 156
 157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 158                                                 unsigned *);
 159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 160                                                bool *, bool *);
 161
 162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 163    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 164    may already be set for general statements (not just data refs).  */
 165
 166 static opt_result
 167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 168                               bool vectype_maybe_set_p,
 169                               poly_uint64 *vf)
 170 {
 171   gimple *stmt = stmt_info->stmt;
 172
 173   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 174        && !STMT_VINFO_LIVE_P (stmt_info))
 175       || gimple_clobber_p (stmt))
 176     {
 177       if (dump_enabled_p ())
 178         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 179       return opt_result::success ();
 180     }
 181
 182   tree stmt_vectype, nunits_vectype;
 183   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 184                                                    &stmt_vectype,
 185                                                    &nunits_vectype);
 186   if (!res)
 187     return res;
 188
 189   if (stmt_vectype)
 190     {
 191       if (STMT_VINFO_VECTYPE (stmt_info))
 192         /* The only case when a vectype had been already set is for stmts
 193            that contain a data ref, or for "pattern-stmts" (stmts generated
 194            by the vectorizer to represent/replace a certain idiom).  */
 195         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 196                      || vectype_maybe_set_p)
 197                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 198       else
 199         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 200     }
 201
 202   if (nunits_vectype)
 203     vect_update_max_nunits (vf, nunits_vectype);
 204
 205   return opt_result::success ();
 206 }
 207
 208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 209    types of STMT_INFO and all attached pattern statements and update
 210    the vectorization factor VF accordingly.  Return true on success
 211    or false if something prevented vectorization.  */
 212
 213 static opt_result
 214 vect_determine_vf_for_stmt (vec_info *vinfo,
 215                             stmt_vec_info stmt_info, poly_uint64 *vf)
 216 {
 217   if (dump_enabled_p ())
 218     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 219                      stmt_info->stmt);
 220   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 221   if (!res)
 222     return res;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 228       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 229
 230       /* If a pattern statement has def stmts, analyze them too.  */
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             dump_printf_loc (MSG_NOTE, vect_location,
 237                              "==> examining pattern def stmt: %G",
 238                              def_stmt_info->stmt);
 239           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 240           if (!res)
 241             return res;
 242         }
 243
 244       if (dump_enabled_p ())
 245         dump_printf_loc (MSG_NOTE, vect_location,
 246                          "==> examining pattern statement: %G",
 247                          stmt_info->stmt);
 248       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 249       if (!res)
 250         return res;
 251     }
 252
 253   return opt_result::success ();
 254 }
 255
 256 /* Function vect_determine_vectorization_factor
 257
 258    Determine the vectorization factor (VF).  VF is the number of data elements
 259    that are operated upon in parallel in a single iteration of the vectorized
 260    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 261    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 262    elements can fit in a single vector register.
 263
 264    We currently support vectorization of loops in which all types operated upon
 265    are of the same size.  Therefore this function currently sets VF according to
 266    the size of the types operated upon, and fails if there are multiple sizes
 267    in the loop.
 268
 269    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 270    original loop:
 271         for (i=0; i<N; i++){
 272           a[i] = b[i] + c[i];
 273         }
 274
 275    vectorized loop:
 276         for (i=0; i<N; i+=VF){
 277           a[i:VF] = b[i:VF] + c[i:VF];
 278         }
 279 */
 280
 281 static opt_result
 282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 283 {
 284   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 285   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 286   unsigned nbbs = loop->num_nodes;
 287   poly_uint64 vectorization_factor = 1;
 288   tree scalar_type = NULL_TREE;
 289   gphi *phi;
 290   tree vectype;
 291   stmt_vec_info stmt_info;
 292   unsigned i;
 293
 294   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 295
 296   for (i = 0; i < nbbs; i++)
 297     {
 298       basic_block bb = bbs[i];
 299
 300       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 301            gsi_next (&si))
 302         {
 303           phi = si.phi ();
 304           stmt_info = loop_vinfo->lookup_stmt (phi);
 305           if (dump_enabled_p ())
 306             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 307                              phi);
 308
 309           gcc_assert (stmt_info);
 310
 311           if (STMT_VINFO_RELEVANT_P (stmt_info)
 312               || STMT_VINFO_LIVE_P (stmt_info))
 313             {
 314               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 315               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 316
 317               if (dump_enabled_p ())
 318                 dump_printf_loc (MSG_NOTE, vect_location,
 319                                  "get vectype for scalar type:  %T\n",
 320                                  scalar_type);
 321
 322               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 323               if (!vectype)
 324                 return opt_result::failure_at (phi,
 325                                                "not vectorized: unsupported "
 326                                                "data-type %T\n",
 327                                                scalar_type);
 328               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 329
 330               if (dump_enabled_p ())
 331                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 332                                  vectype);
 333
 334               if (dump_enabled_p ())
 335                 {
 336                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 337                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 338                   dump_printf (MSG_NOTE, "\n");
 339                 }
 340
 341               vect_update_max_nunits (&vectorization_factor, vectype);
 342             }
 343         }
 344
 345       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 346            gsi_next (&si))
 347         {
 348           if (is_gimple_debug (gsi_stmt (si)))
 349             continue;
 350           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 351           opt_result res
 352             = vect_determine_vf_for_stmt (loop_vinfo,
 353                                           stmt_info, &vectorization_factor);
 354           if (!res)
 355             return res;
 356         }
 357     }
 358
 359   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 360   if (dump_enabled_p ())
 361     {
 362       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 363       dump_dec (MSG_NOTE, vectorization_factor);
 364       dump_printf (MSG_NOTE, "\n");
 365     }
 366
 367   if (known_le (vectorization_factor, 1U))
 368     return opt_result::failure_at (vect_location,
 369                                    "not vectorized: unsupported data-type\n");
 370   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 371   return opt_result::success ();
 372 }
 373
 374
 375 /* Function vect_is_simple_iv_evolution.
 376
 377    FORNOW: A simple evolution of an induction variables in the loop is
 378    considered a polynomial evolution.  */
 379
 380 static bool
 381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 382                              tree * step)
 383 {
 384   tree init_expr;
 385   tree step_expr;
 386   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 387   basic_block bb;
 388
 389   /* When there is no evolution in this loop, the evolution function
 390      is not "simple".  */
 391   if (evolution_part == NULL_TREE)
 392     return false;
 393
 394   /* When the evolution is a polynomial of degree >= 2
 395      the evolution function is not "simple".  */
 396   if (tree_is_chrec (evolution_part))
 397     return false;
 398
 399   step_expr = evolution_part;
 400   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 401
 402   if (dump_enabled_p ())
 403     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 404                      step_expr, init_expr);
 405
 406   *init = init_expr;
 407   *step = step_expr;
 408
 409   if (TREE_CODE (step_expr) != INTEGER_CST
 410       && (TREE_CODE (step_expr) != SSA_NAME
 411           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 412               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 413           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 414               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 415                   || !flag_associative_math)))
 416       && (TREE_CODE (step_expr) != REAL_CST
 417           || !flag_associative_math))
 418     {
 419       if (dump_enabled_p ())
 420         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 421                          "step unknown.\n");
 422       return false;
 423     }
 424
 425   return true;
 426 }
 427
 428 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 429    what we are assuming is a double reduction.  For example, given
 430    a structure like this:
 431
 432       outer1:
 433         x_1 = PHI <x_4(outer2), ...>;
 434         ...
 435
 436       inner:
 437         x_2 = PHI <x_1(outer1), ...>;
 438         ...
 439         x_3 = ...;
 440         ...
 441
 442       outer2:
 443         x_4 = PHI <x_3(inner)>;
 444         ...
 445
 446    outer loop analysis would treat x_1 as a double reduction phi and
 447    this function would then return true for x_2.  */
 448
 449 static bool
 450 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 451 {
 452   use_operand_p use_p;
 453   ssa_op_iter op_iter;
 454   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 455     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 456       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 457         return true;
 458   return false;
 459 }
 460
 461 /* Function vect_analyze_scalar_cycles_1.
 462
 463    Examine the cross iteration def-use cycles of scalar variables
 464    in LOOP.  LOOP_VINFO represents the loop that is now being
 465    considered for vectorization (can be LOOP, or an outer-loop
 466    enclosing LOOP).  */
 467
 468 static void
 469 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 470 {
 471   basic_block bb = loop->header;
 472   tree init, step;
 473   auto_vec<stmt_vec_info, 64> worklist;
 474   gphi_iterator gsi;
 475   bool double_reduc, reduc_chain;
 476
 477   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 478
 479   /* First - identify all inductions.  Reduction detection assumes that all the
 480      inductions have been identified, therefore, this order must not be
 481      changed.  */
 482   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 483     {
 484       gphi *phi = gsi.phi ();
 485       tree access_fn = NULL;
 486       tree def = PHI_RESULT (phi);
 487       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 488
 489       if (dump_enabled_p ())
 490         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 491
 492       /* Skip virtual phi's.  The data dependences that are associated with
 493          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 494       if (virtual_operand_p (def))
 495         continue;
 496
 497       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 498
 499       /* Analyze the evolution function.  */
 500       access_fn = analyze_scalar_evolution (loop, def);
 501       if (access_fn)
 502         {
 503           STRIP_NOPS (access_fn);
 504           if (dump_enabled_p ())
 505             dump_printf_loc (MSG_NOTE, vect_location,
 506                              "Access function of PHI: %T\n", access_fn);
 507           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 508             = initial_condition_in_loop_num (access_fn, loop->num);
 509           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 510             = evolution_part_in_loop_num (access_fn, loop->num);
 511         }
 512
 513       if (!access_fn
 514           || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 515           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 516           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 517               && TREE_CODE (step) != INTEGER_CST))
 518         {
 519           worklist.safe_push (stmt_vinfo);
 520           continue;
 521         }
 522
 523       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 524                   != NULL_TREE);
 525       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 526
 527       if (dump_enabled_p ())
 528         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 529       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 530     }
 531
 532
 533   /* Second - identify all reductions and nested cycles.  */
 534   while (worklist.length () > 0)
 535     {
 536       stmt_vec_info stmt_vinfo = worklist.pop ();
 537       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 538       tree def = PHI_RESULT (phi);
 539
 540       if (dump_enabled_p ())
 541         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 542
 543       gcc_assert (!virtual_operand_p (def)
 544                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 545
 546       stmt_vec_info reduc_stmt_info
 547         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 548                                     &reduc_chain);
 549       if (reduc_stmt_info)
 550         {
 551           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 552           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 553           if (double_reduc)
 554             {
 555               if (dump_enabled_p ())
 556                 dump_printf_loc (MSG_NOTE, vect_location,
 557                                  "Detected double reduction.\n");
 558
 559               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 560               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 561             }
 562           else
 563             {
 564               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 565                 {
 566                   if (dump_enabled_p ())
 567                     dump_printf_loc (MSG_NOTE, vect_location,
 568                                      "Detected vectorizable nested cycle.\n");
 569
 570                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 571                 }
 572               else
 573                 {
 574                   if (dump_enabled_p ())
 575                     dump_printf_loc (MSG_NOTE, vect_location,
 576                                      "Detected reduction.\n");
 577
 578                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 579                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 580                   /* Store the reduction cycles for possible vectorization in
 581                      loop-aware SLP if it was not detected as reduction
 582                      chain.  */
 583                   if (! reduc_chain)
 584                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 585                       (reduc_stmt_info);
 586                 }
 587             }
 588         }
 589       else
 590         if (dump_enabled_p ())
 591           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 592                            "Unknown def-use cycle pattern.\n");
 593     }
 594 }
 595
 596
 597 /* Function vect_analyze_scalar_cycles.
 598
 599    Examine the cross iteration def-use cycles of scalar variables, by
 600    analyzing the loop-header PHIs of scalar variables.  Classify each
 601    cycle as one of the following: invariant, induction, reduction, unknown.
 602    We do that for the loop represented by LOOP_VINFO, and also to its
 603    inner-loop, if exists.
 604    Examples for scalar cycles:
 605
 606    Example1: reduction:
 607
 608               loop1:
 609               for (i=0; i<N; i++)
 610                  sum += a[i];
 611
 612    Example2: induction:
 613
 614               loop2:
 615               for (i=0; i<N; i++)
 616                  a[i] = i;  */
 617
 618 static void
 619 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 620 {
 621   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 622
 623   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 624
 625   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 626      Reductions in such inner-loop therefore have different properties than
 627      the reductions in the nest that gets vectorized:
 628      1. When vectorized, they are executed in the same order as in the original
 629         scalar loop, so we can't change the order of computation when
 630         vectorizing them.
 631      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 632         current checks are too strict.  */
 633
 634   if (loop->inner)
 635     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 636 }
 637
 638 /* Transfer group and reduction information from STMT_INFO to its
 639    pattern stmt.  */
 640
 641 static void
 642 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 643 {
 644   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 645   stmt_vec_info stmtp;
 646   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 647               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 648   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 649   do
 650     {
 651       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 652       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 653                            == STMT_VINFO_DEF_TYPE (stmt_info));
 654       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 655       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 656       if (stmt_info)
 657         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 658           = STMT_VINFO_RELATED_STMT (stmt_info);
 659     }
 660   while (stmt_info);
 661 }
 662
 663 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 664
 665 static void
 666 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 667 {
 668   stmt_vec_info first;
 669   unsigned i;
 670
 671   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 672     {
 673       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 674       while (next)
 675         {
 676           if ((STMT_VINFO_IN_PATTERN_P (next)
 677                != STMT_VINFO_IN_PATTERN_P (first))
 678               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 679             break;
 680           next = REDUC_GROUP_NEXT_ELEMENT (next);
 681         }
 682       /* If all reduction chain members are well-formed patterns adjust
 683          the group to group the pattern stmts instead.  */
 684       if (! next
 685           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 686         {
 687           if (STMT_VINFO_IN_PATTERN_P (first))
 688             {
 689               vect_fixup_reduc_chain (first);
 690               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 691                 = STMT_VINFO_RELATED_STMT (first);
 692             }
 693         }
 694       /* If not all stmt in the chain are patterns or if we failed
 695          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 696          it as regular reduction instead.  */
 697       else
 698         {
 699           stmt_vec_info vinfo = first;
 700           stmt_vec_info last = NULL;
 701           while (vinfo)
 702             {
 703               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 704               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 705               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 706               last = vinfo;
 707               vinfo = next;
 708             }
 709           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 710             = vect_internal_def;
 711           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 712           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 713           --i;
 714         }
 715     }
 716 }
 717
 718 /* Function vect_get_loop_niters.
 719
 720    Determine how many iterations the loop is executed and place it
 721    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 722    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 723    niter information holds in ASSUMPTIONS.
 724
 725    Return the loop exit condition.  */
 726
 727
 728 static gcond *
 729 vect_get_loop_niters (class loop *loop, tree *assumptions,
 730                       tree *number_of_iterations, tree *number_of_iterationsm1)
 731 {
 732   edge exit = single_exit (loop);
 733   class tree_niter_desc niter_desc;
 734   tree niter_assumptions, niter, may_be_zero;
 735   gcond *cond = get_loop_exit_condition (loop);
 736
 737   *assumptions = boolean_true_node;
 738   *number_of_iterationsm1 = chrec_dont_know;
 739   *number_of_iterations = chrec_dont_know;
 740   DUMP_VECT_SCOPE ("get_loop_niters");
 741
 742   if (!exit)
 743     return cond;
 744
 745   may_be_zero = NULL_TREE;
 746   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 747       || chrec_contains_undetermined (niter_desc.niter))
 748     return cond;
 749
 750   niter_assumptions = niter_desc.assumptions;
 751   may_be_zero = niter_desc.may_be_zero;
 752   niter = niter_desc.niter;
 753
 754   if (may_be_zero && integer_zerop (may_be_zero))
 755     may_be_zero = NULL_TREE;
 756
 757   if (may_be_zero)
 758     {
 759       if (COMPARISON_CLASS_P (may_be_zero))
 760         {
 761           /* Try to combine may_be_zero with assumptions, this can simplify
 762              computation of niter expression.  */
 763           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 764             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 765                                              niter_assumptions,
 766                                              fold_build1 (TRUTH_NOT_EXPR,
 767                                                           boolean_type_node,
 768                                                           may_be_zero));
 769           else
 770             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 771                                  build_int_cst (TREE_TYPE (niter), 0),
 772                                  rewrite_to_non_trapping_overflow (niter));
 773
 774           may_be_zero = NULL_TREE;
 775         }
 776       else if (integer_nonzerop (may_be_zero))
 777         {
 778           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 779           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 780           return cond;
 781         }
 782       else
 783         return cond;
 784     }
 785
 786   *assumptions = niter_assumptions;
 787   *number_of_iterationsm1 = niter;
 788
 789   /* We want the number of loop header executions which is the number
 790      of latch executions plus one.
 791      ???  For UINT_MAX latch executions this number overflows to zero
 792      for loops like do { n++; } while (n != 0);  */
 793   if (niter && !chrec_contains_undetermined (niter))
 794     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 795                           build_int_cst (TREE_TYPE (niter), 1));
 796   *number_of_iterations = niter;
 797
 798   return cond;
 799 }
 800
 801 /* Function bb_in_loop_p
 802
 803    Used as predicate for dfs order traversal of the loop bbs.  */
 804
 805 static bool
 806 bb_in_loop_p (const_basic_block bb, const void *data)
 807 {
 808   const class loop *const loop = (const class loop *)data;
 809   if (flow_bb_inside_loop_p (loop, bb))
 810     return true;
 811   return false;
 812 }
 813
 814
 815 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 816    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 817
 818 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 819   : vec_info (vec_info::loop, shared),
 820     loop (loop_in),
 821     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 822     num_itersm1 (NULL_TREE),
 823     num_iters (NULL_TREE),
 824     num_iters_unchanged (NULL_TREE),
 825     num_iters_assumptions (NULL_TREE),
 826     vector_costs (nullptr),
 827     scalar_costs (nullptr),
 828     th (0),
 829     versioning_threshold (0),
 830     vectorization_factor (0),
 831     main_loop_edge (nullptr),
 832     skip_main_loop_edge (nullptr),
 833     skip_this_loop_edge (nullptr),
 834     reusable_accumulators (),
 835     suggested_unroll_factor (1),
 836     max_vectorization_factor (0),
 837     mask_skip_niters (NULL_TREE),
 838     rgroup_compare_type (NULL_TREE),
 839     simd_if_cond (NULL_TREE),
 840     unaligned_dr (NULL),
 841     peeling_for_alignment (0),
 842     ptr_mask (0),
 843     ivexpr_map (NULL),
 844     scan_map (NULL),
 845     slp_unrolling_factor (1),
 846     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 847     vectorizable (false),
 848     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 849     using_partial_vectors_p (false),
 850     epil_using_partial_vectors_p (false),
 851     partial_load_store_bias (0),
 852     peeling_for_gaps (false),
 853     peeling_for_niter (false),
 854     no_data_dependencies (false),
 855     has_mask_store (false),
 856     scalar_loop_scaling (profile_probability::uninitialized ()),
 857     scalar_loop (NULL),
 858     orig_loop_info (NULL)
 859 {
 860   /* CHECKME: We want to visit all BBs before their successors (except for
 861      latch blocks, for which this assertion wouldn't hold).  In the simple
 862      case of the loop forms we allow, a dfs order of the BBs would the same
 863      as reversed postorder traversal, so we are safe.  */
 864
 865   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 866                                           bbs, loop->num_nodes, loop);
 867   gcc_assert (nbbs == loop->num_nodes);
 868
 869   for (unsigned int i = 0; i < nbbs; i++)
 870     {
 871       basic_block bb = bbs[i];
 872       gimple_stmt_iterator si;
 873
 874       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 875         {
 876           gimple *phi = gsi_stmt (si);
 877           gimple_set_uid (phi, 0);
 878           add_stmt (phi);
 879         }
 880
 881       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 882         {
 883           gimple *stmt = gsi_stmt (si);
 884           gimple_set_uid (stmt, 0);
 885           if (is_gimple_debug (stmt))
 886             continue;
 887           add_stmt (stmt);
 888           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 889              third argument is the #pragma omp simd if (x) condition, when 0,
 890              loop shouldn't be vectorized, when non-zero constant, it should
 891              be vectorized normally, otherwise versioned with vectorized loop
 892              done if the condition is non-zero at runtime.  */
 893           if (loop_in->simduid
 894               && is_gimple_call (stmt)
 895               && gimple_call_internal_p (stmt)
 896               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 897               && gimple_call_num_args (stmt) >= 3
 898               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 899               && (loop_in->simduid
 900                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 901             {
 902               tree arg = gimple_call_arg (stmt, 2);
 903               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 904                 simd_if_cond = arg;
 905               else
 906                 gcc_assert (integer_nonzerop (arg));
 907             }
 908         }
 909     }
 910
 911   epilogue_vinfos.create (6);
 912 }
 913
 914 /* Free all levels of rgroup CONTROLS.  */
 915
 916 void
 917 release_vec_loop_controls (vec<rgroup_controls> *controls)
 918 {
 919   rgroup_controls *rgc;
 920   unsigned int i;
 921   FOR_EACH_VEC_ELT (*controls, i, rgc)
 922     rgc->controls.release ();
 923   controls->release ();
 924 }
 925
 926 /* Free all memory used by the _loop_vec_info, as well as all the
 927    stmt_vec_info structs of all the stmts in the loop.  */
 928
 929 _loop_vec_info::~_loop_vec_info ()
 930 {
 931   free (bbs);
 932
 933   release_vec_loop_controls (&masks);
 934   release_vec_loop_controls (&lens);
 935   delete ivexpr_map;
 936   delete scan_map;
 937   epilogue_vinfos.release ();
 938   delete scalar_costs;
 939   delete vector_costs;
 940
 941   /* When we release an epiloge vinfo that we do not intend to use
 942      avoid clearing AUX of the main loop which should continue to
 943      point to the main loop vinfo since otherwise we'll leak that.  */
 944   if (loop->aux == this)
 945     loop->aux = NULL;
 946 }
 947
 948 /* Return an invariant or register for EXPR and emit necessary
 949    computations in the LOOP_VINFO loop preheader.  */
 950
 951 tree
 952 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 953 {
 954   if (is_gimple_reg (expr)
 955       || is_gimple_min_invariant (expr))
 956     return expr;
 957
 958   if (! loop_vinfo->ivexpr_map)
 959     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 960   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 961   if (! cached)
 962     {
 963       gimple_seq stmts = NULL;
 964       cached = force_gimple_operand (unshare_expr (expr),
 965                                      &stmts, true, NULL_TREE);
 966       if (stmts)
 967         {
 968           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 969           gsi_insert_seq_on_edge_immediate (e, stmts);
 970         }
 971     }
 972   return cached;
 973 }
 974
 975 /* Return true if we can use CMP_TYPE as the comparison type to produce
 976    all masks required to mask LOOP_VINFO.  */
 977
 978 static bool
 979 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 980 {
 981   rgroup_controls *rgm;
 982   unsigned int i;
 983   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 984     if (rgm->type != NULL_TREE
 985         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 986                                             cmp_type, rgm->type,
 987                                             OPTIMIZE_FOR_SPEED))
 988       return false;
 989   return true;
 990 }
 991
 992 /* Calculate the maximum number of scalars per iteration for every
 993    rgroup in LOOP_VINFO.  */
 994
 995 static unsigned int
 996 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 997 {
 998   unsigned int res = 1;
 999   unsigned int i;
1000   rgroup_controls *rgm;
1001   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002     res = MAX (res, rgm->max_nscalars_per_iter);
1003   return res;
1004 }
1005
1006 /* Calculate the minimum precision necessary to represent:
1007
1008       MAX_NITERS * FACTOR
1009
1010    as an unsigned integer, where MAX_NITERS is the maximum number of
1011    loop header iterations for the original scalar form of LOOP_VINFO.  */
1012
1013 static unsigned
1014 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1015 {
1016   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1017
1018   /* Get the maximum number of iterations that is representable
1019      in the counter type.  */
1020   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022
1023   /* Get a more refined estimate for the number of iterations.  */
1024   widest_int max_back_edges;
1025   if (max_loop_iterations (loop, &max_back_edges))
1026     max_ni = wi::smin (max_ni, max_back_edges + 1);
1027
1028   /* Work out how many bits we need to represent the limit.  */
1029   return wi::min_precision (max_ni * factor, UNSIGNED);
1030 }
1031
1032 /* True if the loop needs peeling or partial vectors when vectorized.  */
1033
1034 static bool
1035 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1036 {
1037   unsigned HOST_WIDE_INT const_vf;
1038   HOST_WIDE_INT max_niter
1039     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1040
1041   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1042   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1043     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1044                                           (loop_vinfo));
1045
1046   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1047       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1048     {
1049       /* Work out the (constant) number of iterations that need to be
1050          peeled for reasons other than niters.  */
1051       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1052       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1053         peel_niter += 1;
1054       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1055                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1056         return true;
1057     }
1058   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1059       /* ??? When peeling for gaps but not alignment, we could
1060          try to check whether the (variable) niters is known to be
1061          VF * N + 1.  That's something of a niche case though.  */
1062       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1063       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1064       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1065            < (unsigned) exact_log2 (const_vf))
1066           /* In case of versioning, check if the maximum number of
1067              iterations is greater than th.  If they are identical,
1068              the epilogue is unnecessary.  */
1069           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1070               || ((unsigned HOST_WIDE_INT) max_niter
1071                   > (th / const_vf) * const_vf))))
1072     return true;
1073
1074   return false;
1075 }
1076
1077 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1078    whether we can actually generate the masks required.  Return true if so,
1079    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1080
1081 static bool
1082 vect_verify_full_masking (loop_vec_info loop_vinfo)
1083 {
1084   unsigned int min_ni_width;
1085   unsigned int max_nscalars_per_iter
1086     = vect_get_max_nscalars_per_iter (loop_vinfo);
1087
1088   /* Use a normal loop if there are no statements that need masking.
1089      This only happens in rare degenerate cases: it means that the loop
1090      has no loads, no stores, and no live-out values.  */
1091   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1092     return false;
1093
1094   /* Work out how many bits we need to represent the limit.  */
1095   min_ni_width
1096     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1097
1098   /* Find a scalar mode for which WHILE_ULT is supported.  */
1099   opt_scalar_int_mode cmp_mode_iter;
1100   tree cmp_type = NULL_TREE;
1101   tree iv_type = NULL_TREE;
1102   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1103   unsigned int iv_precision = UINT_MAX;
1104
1105   if (iv_limit != -1)
1106     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1107                                       UNSIGNED);
1108
1109   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1110     {
1111       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1112       if (cmp_bits >= min_ni_width
1113           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1114         {
1115           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1116           if (this_type
1117               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1118             {
1119               /* Although we could stop as soon as we find a valid mode,
1120                  there are at least two reasons why that's not always the
1121                  best choice:
1122
1123                  - An IV that's Pmode or wider is more likely to be reusable
1124                    in address calculations than an IV that's narrower than
1125                    Pmode.
1126
1127                  - Doing the comparison in IV_PRECISION or wider allows
1128                    a natural 0-based IV, whereas using a narrower comparison
1129                    type requires mitigations against wrap-around.
1130
1131                  Conversely, if the IV limit is variable, doing the comparison
1132                  in a wider type than the original type can introduce
1133                  unnecessary extensions, so picking the widest valid mode
1134                  is not always a good choice either.
1135
1136                  Here we prefer the first IV type that's Pmode or wider,
1137                  and the first comparison type that's IV_PRECISION or wider.
1138                  (The comparison type must be no wider than the IV type,
1139                  to avoid extensions in the vector loop.)
1140
1141                  ??? We might want to try continuing beyond Pmode for ILP32
1142                  targets if CMP_BITS < IV_PRECISION.  */
1143               iv_type = this_type;
1144               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1145                 cmp_type = this_type;
1146               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1147                 break;
1148             }
1149         }
1150     }
1151
1152   if (!cmp_type)
1153     return false;
1154
1155   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1156   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1157   return true;
1158 }
1159
1160 /* Check whether we can use vector access with length based on precison
1161    comparison.  So far, to keep it simple, we only allow the case that the
1162    precision of the target supported length is larger than the precision
1163    required by loop niters.  */
1164
1165 static bool
1166 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1167 {
1168   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1169     return false;
1170
1171   machine_mode len_load_mode = get_len_load_store_mode
1172     (loop_vinfo->vector_mode, true).require ();
1173   machine_mode len_store_mode = get_len_load_store_mode
1174     (loop_vinfo->vector_mode, false).require ();
1175
1176   signed char partial_load_bias = internal_len_load_store_bias
1177     (IFN_LEN_LOAD, len_load_mode);
1178
1179   signed char partial_store_bias = internal_len_load_store_bias
1180     (IFN_LEN_STORE, len_store_mode);
1181
1182   gcc_assert (partial_load_bias == partial_store_bias);
1183
1184   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1185     return false;
1186
1187   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1188      len_loads with a length of zero.  In order to avoid that we prohibit
1189      more than one loop length here.  */
1190   if (partial_load_bias == -1
1191       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1192     return false;
1193
1194   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1195
1196   unsigned int max_nitems_per_iter = 1;
1197   unsigned int i;
1198   rgroup_controls *rgl;
1199   /* Find the maximum number of items per iteration for every rgroup.  */
1200   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1201     {
1202       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1203       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1204     }
1205
1206   /* Work out how many bits we need to represent the length limit.  */
1207   unsigned int min_ni_prec
1208     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1209
1210   /* Now use the maximum of below precisions for one suitable IV type:
1211      - the IV's natural precision
1212      - the precision needed to hold: the maximum number of scalar
1213        iterations multiplied by the scale factor (min_ni_prec above)
1214      - the Pmode precision
1215
1216      If min_ni_prec is less than the precision of the current niters,
1217      we perfer to still use the niters type.  Prefer to use Pmode and
1218      wider IV to avoid narrow conversions.  */
1219
1220   unsigned int ni_prec
1221     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1222   min_ni_prec = MAX (min_ni_prec, ni_prec);
1223   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1224
1225   tree iv_type = NULL_TREE;
1226   opt_scalar_int_mode tmode_iter;
1227   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1228     {
1229       scalar_mode tmode = tmode_iter.require ();
1230       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1231
1232       /* ??? Do we really want to construct one IV whose precision exceeds
1233          BITS_PER_WORD?  */
1234       if (tbits > BITS_PER_WORD)
1235         break;
1236
1237       /* Find the first available standard integral type.  */
1238       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1239         {
1240           iv_type = build_nonstandard_integer_type (tbits, true);
1241           break;
1242         }
1243     }
1244
1245   if (!iv_type)
1246     {
1247       if (dump_enabled_p ())
1248         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                          "can't vectorize with length-based partial vectors"
1250                          " because there is no suitable iv type.\n");
1251       return false;
1252     }
1253
1254   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1255   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1256
1257   return true;
1258 }
1259
1260 /* Calculate the cost of one scalar iteration of the loop.  */
1261 static void
1262 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1263 {
1264   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1265   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1266   int nbbs = loop->num_nodes, factor;
1267   int innerloop_iters, i;
1268
1269   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1270
1271   /* Gather costs for statements in the scalar loop.  */
1272
1273   /* FORNOW.  */
1274   innerloop_iters = 1;
1275   if (loop->inner)
1276     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1277
1278   for (i = 0; i < nbbs; i++)
1279     {
1280       gimple_stmt_iterator si;
1281       basic_block bb = bbs[i];
1282
1283       if (bb->loop_father == loop->inner)
1284         factor = innerloop_iters;
1285       else
1286         factor = 1;
1287
1288       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1289         {
1290           gimple *stmt = gsi_stmt (si);
1291           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1292
1293           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1294             continue;
1295
1296           /* Skip stmts that are not vectorized inside the loop.  */
1297           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1298           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1299               && (!STMT_VINFO_LIVE_P (vstmt_info)
1300                   || !VECTORIZABLE_CYCLE_DEF
1301                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1302             continue;
1303
1304           vect_cost_for_stmt kind;
1305           if (STMT_VINFO_DATA_REF (stmt_info))
1306             {
1307               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1308                kind = scalar_load;
1309              else
1310                kind = scalar_store;
1311             }
1312           else if (vect_nop_conversion_p (stmt_info))
1313             continue;
1314           else
1315             kind = scalar_stmt;
1316
1317           /* We are using vect_prologue here to avoid scaling twice
1318              by the inner loop factor.  */
1319           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1320                             factor, kind, stmt_info, 0, vect_prologue);
1321         }
1322     }
1323
1324   /* Now accumulate cost.  */
1325   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1326   add_stmt_costs (loop_vinfo->scalar_costs,
1327                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1328   loop_vinfo->scalar_costs->finish_cost (nullptr);
1329 }
1330
1331
1332 /* Function vect_analyze_loop_form.
1333
1334    Verify that certain CFG restrictions hold, including:
1335    - the loop has a pre-header
1336    - the loop has a single entry and exit
1337    - the loop exit condition is simple enough
1338    - the number of iterations can be analyzed, i.e, a countable loop.  The
1339      niter could be analyzed under some assumptions.  */
1340
1341 opt_result
1342 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1343 {
1344   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1345
1346   /* Different restrictions apply when we are considering an inner-most loop,
1347      vs. an outer (nested) loop.
1348      (FORNOW. May want to relax some of these restrictions in the future).  */
1349
1350   info->inner_loop_cond = NULL;
1351   if (!loop->inner)
1352     {
1353       /* Inner-most loop.  We currently require that the number of BBs is
1354          exactly 2 (the header and latch).  Vectorizable inner-most loops
1355          look like this:
1356
1357                         (pre-header)
1358                            |
1359                           header <--------+
1360                            | |            |
1361                            | +--> latch --+
1362                            |
1363                         (exit-bb)  */
1364
1365       if (loop->num_nodes != 2)
1366         return opt_result::failure_at (vect_location,
1367                                        "not vectorized:"
1368                                        " control flow in loop.\n");
1369
1370       if (empty_block_p (loop->header))
1371         return opt_result::failure_at (vect_location,
1372                                        "not vectorized: empty loop.\n");
1373     }
1374   else
1375     {
1376       class loop *innerloop = loop->inner;
1377       edge entryedge;
1378
1379       /* Nested loop. We currently require that the loop is doubly-nested,
1380          contains a single inner loop, and the number of BBs is exactly 5.
1381          Vectorizable outer-loops look like this:
1382
1383                         (pre-header)
1384                            |
1385                           header <---+
1386                            |         |
1387                           inner-loop |
1388                            |         |
1389                           tail ------+
1390                            |
1391                         (exit-bb)
1392
1393          The inner-loop has the properties expected of inner-most loops
1394          as described above.  */
1395
1396       if ((loop->inner)->inner || (loop->inner)->next)
1397         return opt_result::failure_at (vect_location,
1398                                        "not vectorized:"
1399                                        " multiple nested loops.\n");
1400
1401       if (loop->num_nodes != 5)
1402         return opt_result::failure_at (vect_location,
1403                                        "not vectorized:"
1404                                        " control flow in loop.\n");
1405
1406       entryedge = loop_preheader_edge (innerloop);
1407       if (entryedge->src != loop->header
1408           || !single_exit (innerloop)
1409           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1410         return opt_result::failure_at (vect_location,
1411                                        "not vectorized:"
1412                                        " unsupported outerloop form.\n");
1413
1414       /* Analyze the inner-loop.  */
1415       vect_loop_form_info inner;
1416       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1417       if (!res)
1418         {
1419           if (dump_enabled_p ())
1420             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421                              "not vectorized: Bad inner loop.\n");
1422           return res;
1423         }
1424
1425       /* Don't support analyzing niter under assumptions for inner
1426          loop.  */
1427       if (!integer_onep (inner.assumptions))
1428         return opt_result::failure_at (vect_location,
1429                                        "not vectorized: Bad inner loop.\n");
1430
1431       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1432         return opt_result::failure_at (vect_location,
1433                                        "not vectorized: inner-loop count not"
1434                                        " invariant.\n");
1435
1436       if (dump_enabled_p ())
1437         dump_printf_loc (MSG_NOTE, vect_location,
1438                          "Considering outer-loop vectorization.\n");
1439       info->inner_loop_cond = inner.loop_cond;
1440     }
1441
1442   if (!single_exit (loop))
1443     return opt_result::failure_at (vect_location,
1444                                    "not vectorized: multiple exits.\n");
1445   if (EDGE_COUNT (loop->header->preds) != 2)
1446     return opt_result::failure_at (vect_location,
1447                                    "not vectorized:"
1448                                    " too many incoming edges.\n");
1449
1450   /* We assume that the loop exit condition is at the end of the loop. i.e,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     return opt_result::failure_at (vect_location,
1457                                    "not vectorized: latch block not empty.\n");
1458
1459   /* Make sure the exit is not abnormal.  */
1460   edge e = single_exit (loop);
1461   if (e->flags & EDGE_ABNORMAL)
1462     return opt_result::failure_at (vect_location,
1463                                    "not vectorized:"
1464                                    " abnormal loop exit edge.\n");
1465
1466   info->loop_cond
1467     = vect_get_loop_niters (loop, &info->assumptions,
1468                             &info->number_of_iterations,
1469                             &info->number_of_iterationsm1);
1470   if (!info->loop_cond)
1471     return opt_result::failure_at
1472       (vect_location,
1473        "not vectorized: complicated exit condition.\n");
1474
1475   if (integer_zerop (info->assumptions)
1476       || !info->number_of_iterations
1477       || chrec_contains_undetermined (info->number_of_iterations))
1478     return opt_result::failure_at
1479       (info->loop_cond,
1480        "not vectorized: number of iterations cannot be computed.\n");
1481
1482   if (integer_zerop (info->number_of_iterations))
1483     return opt_result::failure_at
1484       (info->loop_cond,
1485        "not vectorized: number of iterations = 0.\n");
1486
1487   if (!(tree_fits_shwi_p (info->number_of_iterations)
1488         && tree_to_shwi (info->number_of_iterations) > 0))
1489     {
1490       if (dump_enabled_p ())
1491         {
1492           dump_printf_loc (MSG_NOTE, vect_location,
1493                            "Symbolic number of iterations is ");
1494           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1495           dump_printf (MSG_NOTE, "\n");
1496         }
1497     }
1498
1499   return opt_result::success ();
1500 }
1501
1502 /* Create a loop_vec_info for LOOP with SHARED and the
1503    vect_analyze_loop_form result.  */
1504
1505 loop_vec_info
1506 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1507                         const vect_loop_form_info *info,
1508                         loop_vec_info main_loop_info)
1509 {
1510   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1511   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1512   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1513   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1514   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1515   /* Also record the assumptions for versioning.  */
1516   if (!integer_onep (info->assumptions) && !main_loop_info)
1517     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1518
1519   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1520   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1521   if (info->inner_loop_cond)
1522     {
1523       stmt_vec_info inner_loop_cond_info
1524         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1525       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1526       /* If we have an estimate on the number of iterations of the inner
1527          loop use that to limit the scale for costing, otherwise use
1528          --param vect-inner-loop-cost-factor literally.  */
1529       widest_int nit;
1530       if (estimated_stmt_executions (loop->inner, &nit))
1531         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1532           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1533     }
1534
1535   return loop_vinfo;
1536 }
1537
1538
1539
1540 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1541    statements update the vectorization factor.  */
1542
1543 static void
1544 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1545 {
1546   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1547   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1548   int nbbs = loop->num_nodes;
1549   poly_uint64 vectorization_factor;
1550   int i;
1551
1552   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1553
1554   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1555   gcc_assert (known_ne (vectorization_factor, 0U));
1556
1557   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1558      vectorization factor of the loop is the unrolling factor required by
1559      the SLP instances.  If that unrolling factor is 1, we say, that we
1560      perform pure SLP on loop - cross iteration parallelism is not
1561      exploited.  */
1562   bool only_slp_in_loop = true;
1563   for (i = 0; i < nbbs; i++)
1564     {
1565       basic_block bb = bbs[i];
1566       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1567            gsi_next (&si))
1568         {
1569           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1570           if (!stmt_info)
1571             continue;
1572           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574               && !PURE_SLP_STMT (stmt_info))
1575             /* STMT needs both SLP and loop-based vectorization.  */
1576             only_slp_in_loop = false;
1577         }
1578       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579            gsi_next (&si))
1580         {
1581           if (is_gimple_debug (gsi_stmt (si)))
1582             continue;
1583           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1584           stmt_info = vect_stmt_to_vectorize (stmt_info);
1585           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1586                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1587               && !PURE_SLP_STMT (stmt_info))
1588             /* STMT needs both SLP and loop-based vectorization.  */
1589             only_slp_in_loop = false;
1590         }
1591     }
1592
1593   if (only_slp_in_loop)
1594     {
1595       if (dump_enabled_p ())
1596         dump_printf_loc (MSG_NOTE, vect_location,
1597                          "Loop contains only SLP stmts\n");
1598       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1599     }
1600   else
1601     {
1602       if (dump_enabled_p ())
1603         dump_printf_loc (MSG_NOTE, vect_location,
1604                          "Loop contains SLP and non-SLP stmts\n");
1605       /* Both the vectorization factor and unroll factor have the form
1606          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1607          so they must have a common multiple.  */
1608       vectorization_factor
1609         = force_common_multiple (vectorization_factor,
1610                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1611     }
1612
1613   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1614   if (dump_enabled_p ())
1615     {
1616       dump_printf_loc (MSG_NOTE, vect_location,
1617                        "Updating vectorization factor to ");
1618       dump_dec (MSG_NOTE, vectorization_factor);
1619       dump_printf (MSG_NOTE, ".\n");
1620     }
1621 }
1622
1623 /* Return true if STMT_INFO describes a double reduction phi and if
1624    the other phi in the reduction is also relevant for vectorization.
1625    This rejects cases such as:
1626
1627       outer1:
1628         x_1 = PHI <x_3(outer2), ...>;
1629         ...
1630
1631       inner:
1632         x_2 = ...;
1633         ...
1634
1635       outer2:
1636         x_3 = PHI <x_2(inner)>;
1637
1638    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1639
1640 static bool
1641 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1642 {
1643   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1644     return false;
1645
1646   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1647 }
1648
1649 /* Function vect_analyze_loop_operations.
1650
1651    Scan the loop stmts and make sure they are all vectorizable.  */
1652
1653 static opt_result
1654 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1655 {
1656   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1657   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1658   int nbbs = loop->num_nodes;
1659   int i;
1660   stmt_vec_info stmt_info;
1661   bool need_to_vectorize = false;
1662   bool ok;
1663
1664   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1665
1666   auto_vec<stmt_info_for_cost> cost_vec;
1667
1668   for (i = 0; i < nbbs; i++)
1669     {
1670       basic_block bb = bbs[i];
1671
1672       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1673            gsi_next (&si))
1674         {
1675           gphi *phi = si.phi ();
1676           ok = true;
1677
1678           stmt_info = loop_vinfo->lookup_stmt (phi);
1679           if (dump_enabled_p ())
1680             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1681           if (virtual_operand_p (gimple_phi_result (phi)))
1682             continue;
1683
1684           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685              (i.e., a phi in the tail of the outer-loop).  */
1686           if (! is_loop_header_bb_p (bb))
1687             {
1688               /* FORNOW: we currently don't support the case that these phis
1689                  are not used in the outerloop (unless it is double reduction,
1690                  i.e., this phi is vect_reduction_def), cause this case
1691                  requires to actually do something here.  */
1692               if (STMT_VINFO_LIVE_P (stmt_info)
1693                   && !vect_active_double_reduction_p (stmt_info))
1694                 return opt_result::failure_at (phi,
1695                                                "Unsupported loop-closed phi"
1696                                                " in outer-loop.\n");
1697
1698               /* If PHI is used in the outer loop, we check that its operand
1699                  is defined in the inner loop.  */
1700               if (STMT_VINFO_RELEVANT_P (stmt_info))
1701                 {
1702                   tree phi_op;
1703
1704                   if (gimple_phi_num_args (phi) != 1)
1705                     return opt_result::failure_at (phi, "unsupported phi");
1706
1707                   phi_op = PHI_ARG_DEF (phi, 0);
1708                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1709                   if (!op_def_info)
1710                     return opt_result::failure_at (phi, "unsupported phi\n");
1711
1712                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1713                       && (STMT_VINFO_RELEVANT (op_def_info)
1714                           != vect_used_in_outer_by_reduction))
1715                     return opt_result::failure_at (phi, "unsupported phi\n");
1716
1717                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1718                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1719                            == vect_double_reduction_def))
1720                       && !vectorizable_lc_phi (loop_vinfo,
1721                                                stmt_info, NULL, NULL))
1722                     return opt_result::failure_at (phi, "unsupported phi\n");
1723                 }
1724
1725               continue;
1726             }
1727
1728           gcc_assert (stmt_info);
1729
1730           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1731                || STMT_VINFO_LIVE_P (stmt_info))
1732               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1733             /* A scalar-dependence cycle that we don't support.  */
1734             return opt_result::failure_at (phi,
1735                                            "not vectorized:"
1736                                            " scalar dependence cycle.\n");
1737
1738           if (STMT_VINFO_RELEVANT_P (stmt_info))
1739             {
1740               need_to_vectorize = true;
1741               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1742                   && ! PURE_SLP_STMT (stmt_info))
1743                 ok = vectorizable_induction (loop_vinfo,
1744                                              stmt_info, NULL, NULL,
1745                                              &cost_vec);
1746               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1747                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1748                             == vect_double_reduction_def)
1749                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1750                        && ! PURE_SLP_STMT (stmt_info))
1751                 ok = vectorizable_reduction (loop_vinfo,
1752                                              stmt_info, NULL, NULL, &cost_vec);
1753             }
1754
1755           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1756           if (ok
1757               && STMT_VINFO_LIVE_P (stmt_info)
1758               && !PURE_SLP_STMT (stmt_info))
1759             ok = vectorizable_live_operation (loop_vinfo,
1760                                               stmt_info, NULL, NULL, NULL,
1761                                               -1, false, &cost_vec);
1762
1763           if (!ok)
1764             return opt_result::failure_at (phi,
1765                                            "not vectorized: relevant phi not "
1766                                            "supported: %G",
1767                                            static_cast <gimple *> (phi));
1768         }
1769
1770       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771            gsi_next (&si))
1772         {
1773           gimple *stmt = gsi_stmt (si);
1774           if (!gimple_clobber_p (stmt)
1775               && !is_gimple_debug (stmt))
1776             {
1777               opt_result res
1778                 = vect_analyze_stmt (loop_vinfo,
1779                                      loop_vinfo->lookup_stmt (stmt),
1780                                      &need_to_vectorize,
1781                                      NULL, NULL, &cost_vec);
1782               if (!res)
1783                 return res;
1784             }
1785         }
1786     } /* bbs */
1787
1788   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1789
1790   /* All operations in the loop are either irrelevant (deal with loop
1791      control, or dead), or only used outside the loop and can be moved
1792      out of the loop (e.g. invariants, inductions).  The loop can be
1793      optimized away by scalar optimizations.  We're better off not
1794      touching this loop.  */
1795   if (!need_to_vectorize)
1796     {
1797       if (dump_enabled_p ())
1798         dump_printf_loc (MSG_NOTE, vect_location,
1799                          "All the computation can be taken out of the loop.\n");
1800       return opt_result::failure_at
1801         (vect_location,
1802          "not vectorized: redundant loop. no profit to vectorize.\n");
1803     }
1804
1805   return opt_result::success ();
1806 }
1807
1808 /* Return true if we know that the iteration count is smaller than the
1809    vectorization factor.  Return false if it isn't, or if we can't be sure
1810    either way.  */
1811
1812 static bool
1813 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1814 {
1815   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816
1817   HOST_WIDE_INT max_niter;
1818   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1819     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1820   else
1821     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1822
1823   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1824     return true;
1825
1826   return false;
1827 }
1828
1829 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1830    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1831    definitely no, or -1 if it's worth retrying.  */
1832
1833 static int
1834 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1835                            unsigned *suggested_unroll_factor)
1836 {
1837   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1839
1840   /* Only loops that can handle partially-populated vectors can have iteration
1841      counts less than the vectorization factor.  */
1842   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1843     {
1844       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1845         {
1846           if (dump_enabled_p ())
1847             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848                              "not vectorized: iteration count smaller than "
1849                              "vectorization factor.\n");
1850           return 0;
1851         }
1852     }
1853
1854   /* If using the "very cheap" model. reject cases in which we'd keep
1855      a copy of the scalar code (even if we might be able to vectorize it).  */
1856   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1857       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1859           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1860     {
1861       if (dump_enabled_p ())
1862         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863                          "some scalar iterations would need to be peeled\n");
1864       return 0;
1865     }
1866
1867   int min_profitable_iters, min_profitable_estimate;
1868   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1869                                       &min_profitable_estimate,
1870                                       suggested_unroll_factor);
1871
1872   if (min_profitable_iters < 0)
1873     {
1874       if (dump_enabled_p ())
1875         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876                          "not vectorized: vectorization not profitable.\n");
1877       if (dump_enabled_p ())
1878         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1879                          "not vectorized: vector version will never be "
1880                          "profitable.\n");
1881       return -1;
1882     }
1883
1884   int min_scalar_loop_bound = (param_min_vect_loop_bound
1885                                * assumed_vf);
1886
1887   /* Use the cost model only if it is more conservative than user specified
1888      threshold.  */
1889   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1890                                     min_profitable_iters);
1891
1892   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1893
1894   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1895       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1896     {
1897       if (dump_enabled_p ())
1898         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899                          "not vectorized: vectorization not profitable.\n");
1900       if (dump_enabled_p ())
1901         dump_printf_loc (MSG_NOTE, vect_location,
1902                          "not vectorized: iteration count smaller than user "
1903                          "specified loop bound parameter or minimum profitable "
1904                          "iterations (whichever is more conservative).\n");
1905       return 0;
1906     }
1907
1908   /* The static profitablity threshold min_profitable_estimate includes
1909      the cost of having to check at runtime whether the scalar loop
1910      should be used instead.  If it turns out that we don't need or want
1911      such a check, the threshold we should use for the static estimate
1912      is simply the point at which the vector loop becomes more profitable
1913      than the scalar loop.  */
1914   if (min_profitable_estimate > min_profitable_iters
1915       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1916       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1917       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1918       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1919     {
1920       if (dump_enabled_p ())
1921         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1922                          " choice between the scalar and vector loops\n");
1923       min_profitable_estimate = min_profitable_iters;
1924     }
1925
1926   /* If the vector loop needs multiple iterations to be beneficial then
1927      things are probably too close to call, and the conservative thing
1928      would be to stick with the scalar code.  */
1929   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1930       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1931     {
1932       if (dump_enabled_p ())
1933         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934                          "one iteration of the vector loop would be"
1935                          " more expensive than the equivalent number of"
1936                          " iterations of the scalar loop\n");
1937       return 0;
1938     }
1939
1940   HOST_WIDE_INT estimated_niter;
1941
1942   /* If we are vectorizing an epilogue then we know the maximum number of
1943      scalar iterations it will cover is at least one lower than the
1944      vectorization factor of the main loop.  */
1945   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946     estimated_niter
1947       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1948   else
1949     {
1950       estimated_niter = estimated_stmt_executions_int (loop);
1951       if (estimated_niter == -1)
1952         estimated_niter = likely_max_stmt_executions_int (loop);
1953     }
1954   if (estimated_niter != -1
1955       && ((unsigned HOST_WIDE_INT) estimated_niter
1956           < MAX (th, (unsigned) min_profitable_estimate)))
1957     {
1958       if (dump_enabled_p ())
1959         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960                          "not vectorized: estimated iteration count too "
1961                          "small.\n");
1962       if (dump_enabled_p ())
1963         dump_printf_loc (MSG_NOTE, vect_location,
1964                          "not vectorized: estimated iteration count smaller "
1965                          "than specified loop bound parameter or minimum "
1966                          "profitable iterations (whichever is more "
1967                          "conservative).\n");
1968       return -1;
1969     }
1970
1971   return 1;
1972 }
1973
1974 static opt_result
1975 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1976                            vec<data_reference_p> *datarefs,
1977                            unsigned int *n_stmts)
1978 {
1979   *n_stmts = 0;
1980   for (unsigned i = 0; i < loop->num_nodes; i++)
1981     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1982          !gsi_end_p (gsi); gsi_next (&gsi))
1983       {
1984         gimple *stmt = gsi_stmt (gsi);
1985         if (is_gimple_debug (stmt))
1986           continue;
1987         ++(*n_stmts);
1988         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1989                                                         NULL, 0);
1990         if (!res)
1991           {
1992             if (is_gimple_call (stmt) && loop->safelen)
1993               {
1994                 tree fndecl = gimple_call_fndecl (stmt), op;
1995                 if (fndecl != NULL_TREE)
1996                   {
1997                     cgraph_node *node = cgraph_node::get (fndecl);
1998                     if (node != NULL && node->simd_clones != NULL)
1999                       {
2000                         unsigned int j, n = gimple_call_num_args (stmt);
2001                         for (j = 0; j < n; j++)
2002                           {
2003                             op = gimple_call_arg (stmt, j);
2004                             if (DECL_P (op)
2005                                 || (REFERENCE_CLASS_P (op)
2006                                     && get_base_address (op)))
2007                               break;
2008                           }
2009                         op = gimple_call_lhs (stmt);
2010                         /* Ignore #pragma omp declare simd functions
2011                            if they don't have data references in the
2012                            call stmt itself.  */
2013                         if (j == n
2014                             && !(op
2015                                  && (DECL_P (op)
2016                                      || (REFERENCE_CLASS_P (op)
2017                                          && get_base_address (op)))))
2018                           continue;
2019                       }
2020                   }
2021               }
2022             return res;
2023           }
2024         /* If dependence analysis will give up due to the limit on the
2025            number of datarefs stop here and fail fatally.  */
2026         if (datarefs->length ()
2027             > (unsigned)param_loop_max_datarefs_for_datadeps)
2028           return opt_result::failure_at (stmt, "exceeded param "
2029                                          "loop-max-datarefs-for-datadeps\n");
2030       }
2031   return opt_result::success ();
2032 }
2033
2034 /* Look for SLP-only access groups and turn each individual access into its own
2035    group.  */
2036 static void
2037 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2038 {
2039   unsigned int i;
2040   struct data_reference *dr;
2041
2042   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2043
2044   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2045   FOR_EACH_VEC_ELT (datarefs, i, dr)
2046     {
2047       gcc_assert (DR_REF (dr));
2048       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2049
2050       /* Check if the load is a part of an interleaving chain.  */
2051       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052         {
2053           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2054           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2055           unsigned int group_size = DR_GROUP_SIZE (first_element);
2056
2057           /* Check if SLP-only groups.  */
2058           if (!STMT_SLP_TYPE (stmt_info)
2059               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2060             {
2061               /* Dissolve the group.  */
2062               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2063
2064               stmt_vec_info vinfo = first_element;
2065               while (vinfo)
2066                 {
2067                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2068                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2069                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2070                   DR_GROUP_SIZE (vinfo) = 1;
2071                   if (STMT_VINFO_STRIDED_P (first_element))
2072                     DR_GROUP_GAP (vinfo) = 0;
2073                   else
2074                     DR_GROUP_GAP (vinfo) = group_size - 1;
2075                   /* Duplicate and adjust alignment info, it needs to
2076                      be present on each group leader, see dr_misalignment.  */
2077                   if (vinfo != first_element)
2078                     {
2079                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2080                       dr_info2->target_alignment = dr_info->target_alignment;
2081                       int misalignment = dr_info->misalignment;
2082                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2083                         {
2084                           HOST_WIDE_INT diff
2085                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2086                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2087                           unsigned HOST_WIDE_INT align_c
2088                             = dr_info->target_alignment.to_constant ();
2089                           misalignment = (misalignment + diff) % align_c;
2090                         }
2091                       dr_info2->misalignment = misalignment;
2092                     }
2093                   vinfo = next;
2094                 }
2095             }
2096         }
2097     }
2098 }
2099
2100 /* Determine if operating on full vectors for LOOP_VINFO might leave
2101    some scalar iterations still to do.  If so, decide how we should
2102    handle those scalar iterations.  The possibilities are:
2103
2104    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2105        In this case:
2106
2107          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2108          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2109          LOOP_VINFO_PEELING_FOR_NITER == false
2110
2111    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2112        to handle the remaining scalar iterations.  In this case:
2113
2114          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2115          LOOP_VINFO_PEELING_FOR_NITER == true
2116
2117        There are two choices:
2118
2119        (2a) Consider vectorizing the epilogue loop at the same VF as the
2120             main loop, but using partial vectors instead of full vectors.
2121             In this case:
2122
2123               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2124
2125        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2126             In this case:
2127
2128               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2129
2130    When FOR_EPILOGUE_P is true, make this determination based on the
2131    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2132    based on the assumption that LOOP_VINFO is the main loop.  The caller
2133    has made sure that the number of iterations is set appropriately for
2134    this value of FOR_EPILOGUE_P.  */
2135
2136 opt_result
2137 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2138                                             bool for_epilogue_p)
2139 {
2140   /* Determine whether there would be any scalar iterations left over.  */
2141   bool need_peeling_or_partial_vectors_p
2142     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2143
2144   /* Decide whether to vectorize the loop with partial vectors.  */
2145   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2146   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2147   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2148       && need_peeling_or_partial_vectors_p)
2149     {
2150       /* For partial-vector-usage=1, try to push the handling of partial
2151          vectors to the epilogue, with the main loop continuing to operate
2152          on full vectors.
2153
2154          If we are unrolling we also do not want to use partial vectors. This
2155          is to avoid the overhead of generating multiple masks and also to
2156          avoid having to execute entire iterations of FALSE masked instructions
2157          when dealing with one or less full iterations.
2158
2159          ??? We could then end up failing to use partial vectors if we
2160          decide to peel iterations into a prologue, and if the main loop
2161          then ends up processing fewer than VF iterations.  */
2162       if ((param_vect_partial_vector_usage == 1
2163            || loop_vinfo->suggested_unroll_factor > 1)
2164           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2165           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2166         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2167       else
2168         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2169     }
2170
2171   if (dump_enabled_p ())
2172     {
2173       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174         dump_printf_loc (MSG_NOTE, vect_location,
2175                          "operating on partial vectors%s.\n",
2176                          for_epilogue_p ? " for epilogue loop" : "");
2177       else
2178         dump_printf_loc (MSG_NOTE, vect_location,
2179                          "operating only on full vectors%s.\n",
2180                          for_epilogue_p ? " for epilogue loop" : "");
2181     }
2182
2183   if (for_epilogue_p)
2184     {
2185       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2186       gcc_assert (orig_loop_vinfo);
2187       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2188         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2189                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2190     }
2191
2192   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2193       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2194     {
2195       /* Check that the loop processes at least one full vector.  */
2196       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2198       if (known_lt (wi::to_widest (scalar_niters), vf))
2199         return opt_result::failure_at (vect_location,
2200                                        "loop does not have enough iterations"
2201                                        " to support vectorization.\n");
2202
2203       /* If we need to peel an extra epilogue iteration to handle data
2204          accesses with gaps, check that there are enough scalar iterations
2205          available.
2206
2207          The check above is redundant with this one when peeling for gaps,
2208          but the distinction is useful for diagnostics.  */
2209       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2210       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2211           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2212         return opt_result::failure_at (vect_location,
2213                                        "loop does not have enough iterations"
2214                                        " to support peeling for gaps.\n");
2215     }
2216
2217   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2218     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2219        && need_peeling_or_partial_vectors_p);
2220
2221   return opt_result::success ();
2222 }
2223
2224 /* Function vect_analyze_loop_2.
2225
2226    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227    for it.  The different analyses will record information in the
2228    loop_vec_info struct.  */
2229 static opt_result
2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2231                      unsigned *suggested_unroll_factor)
2232 {
2233   opt_result ok = opt_result::success ();
2234   int res;
2235   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2236   poly_uint64 min_vf = 2;
2237   loop_vec_info orig_loop_vinfo = NULL;
2238
2239   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240      loop_vec_info of the first vectorized loop.  */
2241   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2242     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2243   else
2244     orig_loop_vinfo = loop_vinfo;
2245   gcc_assert (orig_loop_vinfo);
2246
2247   /* The first group of checks is independent of the vector size.  */
2248   fatal = true;
2249
2250   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2251       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2252     return opt_result::failure_at (vect_location,
2253                                    "not vectorized: simd if(0)\n");
2254
2255   /* Find all data references in the loop (which correspond to vdefs/vuses)
2256      and analyze their evolution in the loop.  */
2257
2258   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2259
2260   /* Gather the data references and count stmts in the loop.  */
2261   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2262     {
2263       opt_result res
2264         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2265                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2266                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2267       if (!res)
2268         {
2269           if (dump_enabled_p ())
2270             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271                              "not vectorized: loop contains function "
2272                              "calls or data references that cannot "
2273                              "be analyzed\n");
2274           return res;
2275         }
2276       loop_vinfo->shared->save_datarefs ();
2277     }
2278   else
2279     loop_vinfo->shared->check_datarefs ();
2280
2281   /* Analyze the data references and also adjust the minimal
2282      vectorization factor according to the loads and stores.  */
2283
2284   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289                          "bad data references.\n");
2290       return ok;
2291     }
2292
2293   /* Classify all cross-iteration scalar data-flow cycles.
2294      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2295   vect_analyze_scalar_cycles (loop_vinfo);
2296
2297   vect_pattern_recog (loop_vinfo);
2298
2299   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2300
2301   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2303
2304   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2305   if (!ok)
2306     {
2307       if (dump_enabled_p ())
2308         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309                          "bad data access.\n");
2310       return ok;
2311     }
2312
2313   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2314
2315   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2316   if (!ok)
2317     {
2318       if (dump_enabled_p ())
2319         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320                          "unexpected pattern.\n");
2321       return ok;
2322     }
2323
2324   /* While the rest of the analysis below depends on it in some way.  */
2325   fatal = false;
2326
2327   /* Analyze data dependences between the data-refs in the loop
2328      and adjust the maximum vectorization factor according to
2329      the dependences.
2330      FORNOW: fail at the first data dependence that we encounter.  */
2331
2332   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2333   if (!ok)
2334     {
2335       if (dump_enabled_p ())
2336         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337                          "bad data dependence.\n");
2338       return ok;
2339     }
2340   if (max_vf != MAX_VECTORIZATION_FACTOR
2341       && maybe_lt (max_vf, min_vf))
2342     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2343   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2344
2345   ok = vect_determine_vectorization_factor (loop_vinfo);
2346   if (!ok)
2347     {
2348       if (dump_enabled_p ())
2349         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350                          "can't determine vectorization factor.\n");
2351       return ok;
2352     }
2353   if (max_vf != MAX_VECTORIZATION_FACTOR
2354       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2355     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2356
2357   /* Compute the scalar iteration cost.  */
2358   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2359
2360   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361
2362   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2363   ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2364   if (!ok)
2365     return ok;
2366
2367   /* If there are any SLP instances mark them as pure_slp.  */
2368   bool slp = vect_make_slp_decision (loop_vinfo);
2369   if (slp)
2370     {
2371       /* Find stmts that need to be both vectorized and SLPed.  */
2372       vect_detect_hybrid_slp (loop_vinfo);
2373
2374       /* Update the vectorization factor based on the SLP decision.  */
2375       vect_update_vf_for_slp (loop_vinfo);
2376
2377       /* Optimize the SLP graph with the vectorization factor fixed.  */
2378       vect_optimize_slp (loop_vinfo);
2379
2380       /* Gather the loads reachable from the SLP graph entries.  */
2381       vect_gather_slp_loads (loop_vinfo);
2382     }
2383
2384   bool saved_can_use_partial_vectors_p
2385     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2386
2387   /* We don't expect to have to roll back to anything other than an empty
2388      set of rgroups.  */
2389   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2390
2391   /* Apply the suggested unrolling factor, this was determined by the backend
2392      during finish_cost the first time we ran the analyzis for this
2393      vector mode.  */
2394   if (loop_vinfo->suggested_unroll_factor > 1)
2395     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2396
2397   /* This is the point where we can re-start analysis with SLP forced off.  */
2398 start_over:
2399
2400   /* Now the vectorization factor is final.  */
2401   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2402   gcc_assert (known_ne (vectorization_factor, 0U));
2403
2404   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2405     {
2406       dump_printf_loc (MSG_NOTE, vect_location,
2407                        "vectorization_factor = ");
2408       dump_dec (MSG_NOTE, vectorization_factor);
2409       dump_printf (MSG_NOTE, ", niters = %wd\n",
2410                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2411     }
2412
2413   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2414
2415   /* Analyze the alignment of the data-refs in the loop.
2416      Fail if a data reference is found that cannot be vectorized.  */
2417
2418   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2419   if (!ok)
2420     {
2421       if (dump_enabled_p ())
2422         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2423                          "bad data alignment.\n");
2424       return ok;
2425     }
2426
2427   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428      It is important to call pruning after vect_analyze_data_ref_accesses,
2429      since we use grouping information gathered by interleaving analysis.  */
2430   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2431   if (!ok)
2432     return ok;
2433
2434   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435      vectorization, since we do not want to add extra peeling or
2436      add versioning for alignment.  */
2437   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2438     /* This pass will decide on using loop versioning and/or loop peeling in
2439        order to enhance the alignment of data references in the loop.  */
2440     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2441   if (!ok)
2442     return ok;
2443
2444   if (slp)
2445     {
2446       /* Analyze operations in the SLP instances.  Note this may
2447          remove unsupported SLP instances which makes the above
2448          SLP kind detection invalid.  */
2449       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2450       vect_slp_analyze_operations (loop_vinfo);
2451       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2452         {
2453           ok = opt_result::failure_at (vect_location,
2454                                        "unsupported SLP instances\n");
2455           goto again;
2456         }
2457
2458       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2459       slp_tree load_node, slp_root;
2460       unsigned i, x;
2461       slp_instance instance;
2462       bool can_use_lanes = true;
2463       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2464         {
2465           slp_root = SLP_INSTANCE_TREE (instance);
2466           int group_size = SLP_TREE_LANES (slp_root);
2467           tree vectype = SLP_TREE_VECTYPE (slp_root);
2468           bool loads_permuted = false;
2469           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2470             {
2471               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2472                 continue;
2473               unsigned j;
2474               stmt_vec_info load_info;
2475               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2476                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2477                   {
2478                     loads_permuted = true;
2479                     break;
2480                   }
2481             }
2482
2483           /* If the loads and stores can be handled with load/store-lane
2484              instructions record it and move on to the next instance.  */
2485           if (loads_permuted
2486               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2487               && vect_store_lanes_supported (vectype, group_size, false))
2488             {
2489               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2490                 {
2491                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2492                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2493                   /* Use SLP for strided accesses (or if we can't
2494                      load-lanes).  */
2495                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2496                       || ! vect_load_lanes_supported
2497                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2498                              DR_GROUP_SIZE (stmt_vinfo), false))
2499                     break;
2500                 }
2501
2502               can_use_lanes
2503                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2504
2505               if (can_use_lanes && dump_enabled_p ())
2506                 dump_printf_loc (MSG_NOTE, vect_location,
2507                                  "SLP instance %p can use load/store-lanes\n",
2508                                  instance);
2509             }
2510           else
2511             {
2512               can_use_lanes = false;
2513               break;
2514             }
2515         }
2516
2517       /* If all SLP instances can use load/store-lanes abort SLP and try again
2518          with SLP disabled.  */
2519       if (can_use_lanes)
2520         {
2521           ok = opt_result::failure_at (vect_location,
2522                                        "Built SLP cancelled: can use "
2523                                        "load/store-lanes\n");
2524           if (dump_enabled_p ())
2525             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526                              "Built SLP cancelled: all SLP instances support "
2527                              "load/store-lanes\n");
2528           goto again;
2529         }
2530     }
2531
2532   /* Dissolve SLP-only groups.  */
2533   vect_dissolve_slp_only_groups (loop_vinfo);
2534
2535   /* Scan all the remaining operations in the loop that are not subject
2536      to SLP and make sure they are vectorizable.  */
2537   ok = vect_analyze_loop_operations (loop_vinfo);
2538   if (!ok)
2539     {
2540       if (dump_enabled_p ())
2541         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542                          "bad operation or unsupported loop bound.\n");
2543       return ok;
2544     }
2545
2546   /* For now, we don't expect to mix both masking and length approaches for one
2547      loop, disable it if both are recorded.  */
2548   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2549       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2550       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2551     {
2552       if (dump_enabled_p ())
2553         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554                          "can't vectorize a loop with partial vectors"
2555                          " because we don't expect to mix different"
2556                          " approaches with partial vectors for the"
2557                          " same loop.\n");
2558       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2559     }
2560
2561   /* If we still have the option of using partial vectors,
2562      check whether we can generate the necessary loop controls.  */
2563   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2564       && !vect_verify_full_masking (loop_vinfo)
2565       && !vect_verify_loop_lens (loop_vinfo))
2566     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2567
2568   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569      to be able to handle fewer than VF scalars, or needs to have a lower VF
2570      than the main loop.  */
2571   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2572       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2573       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2574                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2575     return opt_result::failure_at (vect_location,
2576                                    "Vectorization factor too high for"
2577                                    " epilogue loop.\n");
2578
2579   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580      assuming that the loop will be used as a main loop.  We will redo
2581      this analysis later if we instead decide to use the loop as an
2582      epilogue loop.  */
2583   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2584   if (!ok)
2585     return ok;
2586
2587   /* Check the costings of the loop make vectorizing worthwhile.  */
2588   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2589   if (res < 0)
2590     {
2591       ok = opt_result::failure_at (vect_location,
2592                                    "Loop costings may not be worthwhile.\n");
2593       goto again;
2594     }
2595   if (!res)
2596     return opt_result::failure_at (vect_location,
2597                                    "Loop costings not worthwhile.\n");
2598
2599   /* If an epilogue loop is required make sure we can create one.  */
2600   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2601       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602     {
2603       if (dump_enabled_p ())
2604         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2605       if (!vect_can_advance_ivs_p (loop_vinfo)
2606           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2607                                            single_exit (LOOP_VINFO_LOOP
2608                                                          (loop_vinfo))))
2609         {
2610           ok = opt_result::failure_at (vect_location,
2611                                        "not vectorized: can't create required "
2612                                        "epilog loop\n");
2613           goto again;
2614         }
2615     }
2616
2617   /* During peeling, we need to check if number of loop iterations is
2618      enough for both peeled prolog loop and vector loop.  This check
2619      can be merged along with threshold check of loop versioning, so
2620      increase threshold for this case if necessary.
2621
2622      If we are analyzing an epilogue we still want to check what its
2623      versioning threshold would be.  If we decide to vectorize the epilogues we
2624      will want to use the lowest versioning threshold of all epilogues and main
2625      loop.  This will enable us to enter a vectorized epilogue even when
2626      versioning the loop.  We can't simply check whether the epilogue requires
2627      versioning though since we may have skipped some versioning checks when
2628      analyzing the epilogue.  For instance, checks for alias versioning will be
2629      skipped when dealing with epilogues as we assume we already checked them
2630      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2631   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2632     {
2633       poly_uint64 niters_th = 0;
2634       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2635
2636       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2637         {
2638           /* Niters for peeled prolog loop.  */
2639           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2640             {
2641               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2642               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2643               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2644             }
2645           else
2646             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647         }
2648
2649       /* Niters for at least one iteration of vectorized loop.  */
2650       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2651         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2652       /* One additional iteration because of peeling for gap.  */
2653       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2654         niters_th += 1;
2655
2656       /*  Use the same condition as vect_transform_loop to decide when to use
2657           the cost to determine a versioning threshold.  */
2658       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2659           && ordered_p (th, niters_th))
2660         niters_th = ordered_max (poly_uint64 (th), niters_th);
2661
2662       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2663     }
2664
2665   gcc_assert (known_eq (vectorization_factor,
2666                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2667
2668   /* Ok to vectorize!  */
2669   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2670   return opt_result::success ();
2671
2672 again:
2673   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2674   gcc_assert (!ok);
2675
2676   /* Try again with SLP forced off but if we didn't do any SLP there is
2677      no point in re-trying.  */
2678   if (!slp)
2679     return ok;
2680
2681   /* If there are reduction chains re-trying will fail anyway.  */
2682   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2683     return ok;
2684
2685   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686      via interleaving or lane instructions.  */
2687   slp_instance instance;
2688   slp_tree node;
2689   unsigned i, j;
2690   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2691     {
2692       stmt_vec_info vinfo;
2693       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2694       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2695         continue;
2696       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2697       unsigned int size = DR_GROUP_SIZE (vinfo);
2698       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2699       if (! vect_store_lanes_supported (vectype, size, false)
2700          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2701          && ! vect_grouped_store_supported (vectype, size))
2702         return opt_result::failure_at (vinfo->stmt,
2703                                        "unsupported grouped store\n");
2704       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2705         {
2706           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2707           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2708           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2709           size = DR_GROUP_SIZE (vinfo);
2710           vectype = STMT_VINFO_VECTYPE (vinfo);
2711           if (! vect_load_lanes_supported (vectype, size, false)
2712               && ! vect_grouped_load_supported (vectype, single_element_p,
2713                                                 size))
2714             return opt_result::failure_at (vinfo->stmt,
2715                                            "unsupported grouped load\n");
2716         }
2717     }
2718
2719   if (dump_enabled_p ())
2720     dump_printf_loc (MSG_NOTE, vect_location,
2721                      "re-trying with SLP disabled\n");
2722
2723   /* Roll back state appropriately.  No SLP this time.  */
2724   slp = false;
2725   /* Restore vectorization factor as it were without SLP.  */
2726   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2727   /* Free the SLP instances.  */
2728   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2729     vect_free_slp_instance (instance);
2730   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2731   /* Reset SLP type to loop_vect on all stmts.  */
2732   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2733     {
2734       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2735       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2736            !gsi_end_p (si); gsi_next (&si))
2737         {
2738           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2739           STMT_SLP_TYPE (stmt_info) = loop_vect;
2740           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2741               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2742             {
2743               /* vectorizable_reduction adjusts reduction stmt def-types,
2744                  restore them to that of the PHI.  */
2745               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2746                 = STMT_VINFO_DEF_TYPE (stmt_info);
2747               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2749                 = STMT_VINFO_DEF_TYPE (stmt_info);
2750             }
2751         }
2752       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2753            !gsi_end_p (si); gsi_next (&si))
2754         {
2755           if (is_gimple_debug (gsi_stmt (si)))
2756             continue;
2757           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2758           STMT_SLP_TYPE (stmt_info) = loop_vect;
2759           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2760             {
2761               stmt_vec_info pattern_stmt_info
2762                 = STMT_VINFO_RELATED_STMT (stmt_info);
2763               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2764                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2765
2766               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2767               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2768               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2769                    !gsi_end_p (pi); gsi_next (&pi))
2770                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2771                   = loop_vect;
2772             }
2773         }
2774     }
2775   /* Free optimized alias test DDRS.  */
2776   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2777   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2778   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2779   /* Reset target cost data.  */
2780   delete loop_vinfo->vector_costs;
2781   loop_vinfo->vector_costs = nullptr;
2782   /* Reset accumulated rgroup information.  */
2783   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2784   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2785   /* Reset assorted flags.  */
2786   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2787   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2788   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2789   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2790   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2791     = saved_can_use_partial_vectors_p;
2792
2793   goto start_over;
2794 }
2795
2796 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2797    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2798    OLD_LOOP_VINFO is better unless something specifically indicates
2799    otherwise.
2800
2801    Note that this deliberately isn't a partial order.  */
2802
2803 static bool
2804 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2805                           loop_vec_info old_loop_vinfo)
2806 {
2807   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2808   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2809
2810   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2811   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2812
2813   /* Always prefer a VF of loop->simdlen over any other VF.  */
2814   if (loop->simdlen)
2815     {
2816       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2817       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2818       if (new_simdlen_p != old_simdlen_p)
2819         return new_simdlen_p;
2820     }
2821
2822   const auto *old_costs = old_loop_vinfo->vector_costs;
2823   const auto *new_costs = new_loop_vinfo->vector_costs;
2824   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2825     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2826
2827   return new_costs->better_main_loop_than_p (old_costs);
2828 }
2829
2830 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2831    true if we should.  */
2832
2833 static bool
2834 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2835                         loop_vec_info old_loop_vinfo)
2836 {
2837   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2838     return false;
2839
2840   if (dump_enabled_p ())
2841     dump_printf_loc (MSG_NOTE, vect_location,
2842                      "***** Preferring vector mode %s to vector mode %s\n",
2843                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
2844                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
2845   return true;
2846 }
2847
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2849    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2850    MODE_I to the next mode useful to analyze.
2851    Return the loop_vinfo on success and wrapped null on failure.  */
2852
2853 static opt_loop_vec_info
2854 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2855                      const vect_loop_form_info *loop_form_info,
2856                      loop_vec_info main_loop_vinfo,
2857                      const vector_modes &vector_modes, unsigned &mode_i,
2858                      machine_mode &autodetected_vector_mode,
2859                      bool &fatal)
2860 {
2861   loop_vec_info loop_vinfo
2862     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2863
2864   machine_mode vector_mode = vector_modes[mode_i];
2865   loop_vinfo->vector_mode = vector_mode;
2866   unsigned int suggested_unroll_factor = 1;
2867
2868   /* Run the main analysis.  */
2869   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2870                                         &suggested_unroll_factor);
2871   if (dump_enabled_p ())
2872     dump_printf_loc (MSG_NOTE, vect_location,
2873                      "***** Analysis %s with vector mode %s\n",
2874                      res ? "succeeded" : " failed",
2875                      GET_MODE_NAME (loop_vinfo->vector_mode));
2876
2877   if (!main_loop_vinfo && suggested_unroll_factor > 1)
2878     {
2879       if (dump_enabled_p ())
2880         dump_printf_loc (MSG_NOTE, vect_location,
2881                          "***** Re-trying analysis for unrolling"
2882                          " with unroll factor %d.\n",
2883                          suggested_unroll_factor);
2884       loop_vec_info unroll_vinfo
2885         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2886       unroll_vinfo->vector_mode = vector_mode;
2887       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2888       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
2889       if (new_res)
2890         {
2891           delete loop_vinfo;
2892           loop_vinfo = unroll_vinfo;
2893         }
2894       else
2895         delete unroll_vinfo;
2896     }
2897
2898   /* Remember the autodetected vector mode.  */
2899   if (vector_mode == VOIDmode)
2900     autodetected_vector_mode = loop_vinfo->vector_mode;
2901
2902   /* Advance mode_i, first skipping modes that would result in the
2903      same analysis result.  */
2904   while (mode_i + 1 < vector_modes.length ()
2905          && vect_chooses_same_modes_p (loop_vinfo,
2906                                        vector_modes[mode_i + 1]))
2907     {
2908       if (dump_enabled_p ())
2909         dump_printf_loc (MSG_NOTE, vect_location,
2910                          "***** The result for vector mode %s would"
2911                          " be the same\n",
2912                          GET_MODE_NAME (vector_modes[mode_i + 1]));
2913       mode_i += 1;
2914     }
2915   if (mode_i + 1 < vector_modes.length ()
2916       && VECTOR_MODE_P (autodetected_vector_mode)
2917       && (related_vector_mode (vector_modes[mode_i + 1],
2918                                GET_MODE_INNER (autodetected_vector_mode))
2919           == autodetected_vector_mode)
2920       && (related_vector_mode (autodetected_vector_mode,
2921                                GET_MODE_INNER (vector_modes[mode_i + 1]))
2922           == vector_modes[mode_i + 1]))
2923     {
2924       if (dump_enabled_p ())
2925         dump_printf_loc (MSG_NOTE, vect_location,
2926                          "***** Skipping vector mode %s, which would"
2927                          " repeat the analysis for %s\n",
2928                          GET_MODE_NAME (vector_modes[mode_i + 1]),
2929                          GET_MODE_NAME (autodetected_vector_mode));
2930       mode_i += 1;
2931     }
2932   mode_i++;
2933
2934   if (!res)
2935     {
2936       delete loop_vinfo;
2937       if (fatal)
2938         gcc_checking_assert (main_loop_vinfo == NULL);
2939       return opt_loop_vec_info::propagate_failure (res);
2940     }
2941
2942   return opt_loop_vec_info::success (loop_vinfo);
2943 }
2944
2945 /* Function vect_analyze_loop.
2946
2947    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948    for it.  The different analyses will record information in the
2949    loop_vec_info struct.  */
2950 opt_loop_vec_info
2951 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2952 {
2953   DUMP_VECT_SCOPE ("analyze_loop_nest");
2954
2955   if (loop_outer (loop)
2956       && loop_vec_info_for_loop (loop_outer (loop))
2957       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2958     return opt_loop_vec_info::failure_at (vect_location,
2959                                           "outer-loop already vectorized.\n");
2960
2961   if (!find_loop_nest (loop, &shared->loop_nest))
2962     return opt_loop_vec_info::failure_at
2963       (vect_location,
2964        "not vectorized: loop nest containing two or more consecutive inner"
2965        " loops cannot be vectorized\n");
2966
2967   /* Analyze the loop form.  */
2968   vect_loop_form_info loop_form_info;
2969   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2970   if (!res)
2971     {
2972       if (dump_enabled_p ())
2973         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974                          "bad loop form.\n");
2975       return opt_loop_vec_info::propagate_failure (res);
2976     }
2977   if (!integer_onep (loop_form_info.assumptions))
2978     {
2979       /* We consider to vectorize this loop by versioning it under
2980          some assumptions.  In order to do this, we need to clear
2981          existing information computed by scev and niter analyzer.  */
2982       scev_reset_htab ();
2983       free_numbers_of_iterations_estimates (loop);
2984       /* Also set flag for this loop so that following scev and niter
2985          analysis are done under the assumptions.  */
2986       loop_constraint_set (loop, LOOP_C_FINITE);
2987     }
2988
2989   auto_vector_modes vector_modes;
2990   /* Autodetect first vector size we try.  */
2991   vector_modes.safe_push (VOIDmode);
2992   unsigned int autovec_flags
2993     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2994                                                     loop->simdlen != 0);
2995   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2996                              && !unlimited_cost_model (loop));
2997   machine_mode autodetected_vector_mode = VOIDmode;
2998   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2999   unsigned int mode_i = 0;
3000   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3001
3002   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3003      a mode has not been analyzed.  */
3004   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3005   for (unsigned i = 0; i < vector_modes.length (); ++i)
3006     cached_vf_per_mode.safe_push (0);
3007
3008   /* First determine the main loop vectorization mode, either the first
3009      one that works, starting with auto-detecting the vector mode and then
3010      following the targets order of preference, or the one with the
3011      lowest cost if pick_lowest_cost_p.  */
3012   while (1)
3013     {
3014       bool fatal;
3015       unsigned int last_mode_i = mode_i;
3016       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3017          failed.  */
3018       cached_vf_per_mode[last_mode_i] = -1;
3019       opt_loop_vec_info loop_vinfo
3020         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3021                                NULL, vector_modes, mode_i,
3022                                autodetected_vector_mode, fatal);
3023       if (fatal)
3024         break;
3025
3026       if (loop_vinfo)
3027         {
3028           /*  Analyzis has been successful so update the VF value.  The
3029               VF should always be a multiple of unroll_factor and we want to
3030               capture the original VF here.  */
3031           cached_vf_per_mode[last_mode_i]
3032             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3033                          loop_vinfo->suggested_unroll_factor);
3034           /* Once we hit the desired simdlen for the first time,
3035              discard any previous attempts.  */
3036           if (simdlen
3037               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038             {
3039               delete first_loop_vinfo;
3040               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3041               simdlen = 0;
3042             }
3043           else if (pick_lowest_cost_p
3044                    && first_loop_vinfo
3045                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3046             {
3047               /* Pick loop_vinfo over first_loop_vinfo.  */
3048               delete first_loop_vinfo;
3049               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050             }
3051           if (first_loop_vinfo == NULL)
3052             first_loop_vinfo = loop_vinfo;
3053           else
3054             {
3055               delete loop_vinfo;
3056               loop_vinfo = opt_loop_vec_info::success (NULL);
3057             }
3058
3059           /* Commit to first_loop_vinfo if we have no reason to try
3060              alternatives.  */
3061           if (!simdlen && !pick_lowest_cost_p)
3062             break;
3063         }
3064       if (mode_i == vector_modes.length ()
3065           || autodetected_vector_mode == VOIDmode)
3066         break;
3067
3068       /* Try the next biggest vector size.  */
3069       if (dump_enabled_p ())
3070         dump_printf_loc (MSG_NOTE, vect_location,
3071                          "***** Re-trying analysis with vector mode %s\n",
3072                          GET_MODE_NAME (vector_modes[mode_i]));
3073     }
3074   if (!first_loop_vinfo)
3075     return opt_loop_vec_info::propagate_failure (res);
3076
3077   if (dump_enabled_p ())
3078     dump_printf_loc (MSG_NOTE, vect_location,
3079                      "***** Choosing vector mode %s\n",
3080                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3081
3082   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083      enabled, SIMDUID is not set, it is the innermost loop and we have
3084      either already found the loop's SIMDLEN or there was no SIMDLEN to
3085      begin with.
3086      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3087   bool vect_epilogues = (!simdlen
3088                          && loop->inner == NULL
3089                          && param_vect_epilogues_nomask
3090                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3091                          && !loop->simduid);
3092   if (!vect_epilogues)
3093     return first_loop_vinfo;
3094
3095   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3096   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3097
3098   /* For epilogues start the analysis from the first mode.  The motivation
3099      behind starting from the beginning comes from cases where the VECTOR_MODES
3100      array may contain length-agnostic and length-specific modes.  Their
3101      ordering is not guaranteed, so we could end up picking a mode for the main
3102      loop that is after the epilogue's optimal mode.  */
3103   vector_modes[0] = autodetected_vector_mode;
3104   mode_i = 0;
3105
3106   bool supports_partial_vectors =
3107     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3108   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3109
3110   while (1)
3111     {
3112       /* If the target does not support partial vectors we can shorten the
3113          number of modes to analyze for the epilogue as we know we can't pick a
3114          mode that would lead to a VF at least as big as the
3115          FIRST_VINFO_VF.  */
3116       if (!supports_partial_vectors
3117           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3118         {
3119           mode_i++;
3120           if (mode_i == vector_modes.length ())
3121             break;
3122           continue;
3123         }
3124
3125       if (dump_enabled_p ())
3126         dump_printf_loc (MSG_NOTE, vect_location,
3127                          "***** Re-trying epilogue analysis with vector "
3128                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3129
3130       bool fatal;
3131       opt_loop_vec_info loop_vinfo
3132         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133                                first_loop_vinfo,
3134                                vector_modes, mode_i,
3135                                autodetected_vector_mode, fatal);
3136       if (fatal)
3137         break;
3138
3139       if (loop_vinfo)
3140         {
3141           if (pick_lowest_cost_p)
3142             {
3143               /* Keep trying to roll back vectorization attempts while the
3144                  loop_vec_infos they produced were worse than this one.  */
3145               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3146               while (!vinfos.is_empty ()
3147                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3148                 {
3149                   gcc_assert (vect_epilogues);
3150                   delete vinfos.pop ();
3151                 }
3152             }
3153           /* For now only allow one epilogue loop.  */
3154           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3155             {
3156               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3157               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3158               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3159                           || maybe_ne (lowest_th, 0U));
3160               /* Keep track of the known smallest versioning
3161                  threshold.  */
3162               if (ordered_p (lowest_th, th))
3163                 lowest_th = ordered_min (lowest_th, th);
3164             }
3165           else
3166             {
3167               delete loop_vinfo;
3168               loop_vinfo = opt_loop_vec_info::success (NULL);
3169             }
3170
3171           /* For now only allow one epilogue loop, but allow
3172              pick_lowest_cost_p to replace it, so commit to the
3173              first epilogue if we have no reason to try alternatives.  */
3174           if (!pick_lowest_cost_p)
3175             break;
3176         }
3177
3178       if (mode_i == vector_modes.length ())
3179         break;
3180
3181     }
3182
3183   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3184     {
3185       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3186       if (dump_enabled_p ())
3187         dump_printf_loc (MSG_NOTE, vect_location,
3188                          "***** Choosing epilogue vector mode %s\n",
3189                          GET_MODE_NAME
3190                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3191     }
3192
3193   return first_loop_vinfo;
3194 }
3195
3196 /* Return true if there is an in-order reduction function for CODE, storing
3197    it in *REDUC_FN if so.  */
3198
3199 static bool
3200 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3201 {
3202   if (code == PLUS_EXPR)
3203     {
3204       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3205       return true;
3206     }
3207   return false;
3208 }
3209
3210 /* Function reduction_fn_for_scalar_code
3211
3212    Input:
3213    CODE - tree_code of a reduction operations.
3214
3215    Output:
3216    REDUC_FN - the corresponding internal function to be used to reduce the
3217       vector of partial results into a single scalar result, or IFN_LAST
3218       if the operation is a supported reduction operation, but does not have
3219       such an internal function.
3220
3221    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3222
3223 bool
3224 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3225 {
3226   if (code.is_tree_code ())
3227     switch (tree_code (code))
3228       {
3229       case MAX_EXPR:
3230         *reduc_fn = IFN_REDUC_MAX;
3231         return true;
3232
3233       case MIN_EXPR:
3234         *reduc_fn = IFN_REDUC_MIN;
3235         return true;
3236
3237       case PLUS_EXPR:
3238         *reduc_fn = IFN_REDUC_PLUS;
3239         return true;
3240
3241       case BIT_AND_EXPR:
3242         *reduc_fn = IFN_REDUC_AND;
3243         return true;
3244
3245       case BIT_IOR_EXPR:
3246         *reduc_fn = IFN_REDUC_IOR;
3247         return true;
3248
3249       case BIT_XOR_EXPR:
3250         *reduc_fn = IFN_REDUC_XOR;
3251         return true;
3252
3253       case MULT_EXPR:
3254       case MINUS_EXPR:
3255         *reduc_fn = IFN_LAST;
3256         return true;
3257
3258       default:
3259         return false;
3260       }
3261   else
3262     switch (combined_fn (code))
3263       {
3264       CASE_CFN_FMAX:
3265         *reduc_fn = IFN_REDUC_FMAX;
3266         return true;
3267
3268       CASE_CFN_FMIN:
3269         *reduc_fn = IFN_REDUC_FMIN;
3270         return true;
3271
3272       default:
3273         return false;
3274       }
3275 }
3276
3277 /* If there is a neutral value X such that a reduction would not be affected
3278    by the introduction of additional X elements, return that X, otherwise
3279    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3280    of the scalar elements.  If the reduction has just a single initial value
3281    then INITIAL_VALUE is that value, otherwise it is null.  */
3282
3283 tree
3284 neutral_op_for_reduction (tree scalar_type, code_helper code,
3285                           tree initial_value)
3286 {
3287   if (code.is_tree_code ())
3288     switch (tree_code (code))
3289       {
3290       case WIDEN_SUM_EXPR:
3291       case DOT_PROD_EXPR:
3292       case SAD_EXPR:
3293       case PLUS_EXPR:
3294       case MINUS_EXPR:
3295       case BIT_IOR_EXPR:
3296       case BIT_XOR_EXPR:
3297         return build_zero_cst (scalar_type);
3298
3299       case MULT_EXPR:
3300         return build_one_cst (scalar_type);
3301
3302       case BIT_AND_EXPR:
3303         return build_all_ones_cst (scalar_type);
3304
3305       case MAX_EXPR:
3306       case MIN_EXPR:
3307         return initial_value;
3308
3309       default:
3310         return NULL_TREE;
3311       }
3312   else
3313     switch (combined_fn (code))
3314       {
3315       CASE_CFN_FMIN:
3316       CASE_CFN_FMAX:
3317         return initial_value;
3318
3319       default:
3320         return NULL_TREE;
3321       }
3322 }
3323
3324 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3325    STMT is printed with a message MSG. */
3326
3327 static void
3328 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3329 {
3330   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3331 }
3332
3333 /* Return true if we need an in-order reduction for operation CODE
3334    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3335    overflow must wrap.  */
3336
3337 bool
3338 needs_fold_left_reduction_p (tree type, code_helper code)
3339 {
3340   /* CHECKME: check for !flag_finite_math_only too?  */
3341   if (SCALAR_FLOAT_TYPE_P (type))
3342     {
3343       if (code.is_tree_code ())
3344         switch (tree_code (code))
3345           {
3346           case MIN_EXPR:
3347           case MAX_EXPR:
3348             return false;
3349
3350           default:
3351             return !flag_associative_math;
3352           }
3353       else
3354         switch (combined_fn (code))
3355           {
3356           CASE_CFN_FMIN:
3357           CASE_CFN_FMAX:
3358             return false;
3359
3360           default:
3361             return !flag_associative_math;
3362           }
3363     }
3364
3365   if (INTEGRAL_TYPE_P (type))
3366     return (!code.is_tree_code ()
3367             || !operation_no_trapping_overflow (type, tree_code (code)));
3368
3369   if (SAT_FIXED_POINT_TYPE_P (type))
3370     return true;
3371
3372   return false;
3373 }
3374
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3376    has a handled computation expression.  Store the main reduction
3377    operation in *CODE.  */
3378
3379 static bool
3380 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3381                       tree loop_arg, code_helper *code,
3382                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3383 {
3384   auto_bitmap visited;
3385   tree lookfor = PHI_RESULT (phi);
3386   ssa_op_iter curri;
3387   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3388   while (USE_FROM_PTR (curr) != loop_arg)
3389     curr = op_iter_next_use (&curri);
3390   curri.i = curri.numops;
3391   do
3392     {
3393       path.safe_push (std::make_pair (curri, curr));
3394       tree use = USE_FROM_PTR (curr);
3395       if (use == lookfor)
3396         break;
3397       gimple *def = SSA_NAME_DEF_STMT (use);
3398       if (gimple_nop_p (def)
3399           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3400         {
3401 pop:
3402           do
3403             {
3404               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3405               curri = x.first;
3406               curr = x.second;
3407               do
3408                 curr = op_iter_next_use (&curri);
3409               /* Skip already visited or non-SSA operands (from iterating
3410                  over PHI args).  */
3411               while (curr != NULL_USE_OPERAND_P
3412                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3413                          || ! bitmap_set_bit (visited,
3414                                               SSA_NAME_VERSION
3415                                                 (USE_FROM_PTR (curr)))));
3416             }
3417           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3418           if (curr == NULL_USE_OPERAND_P)
3419             break;
3420         }
3421       else
3422         {
3423           if (gimple_code (def) == GIMPLE_PHI)
3424             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3425           else
3426             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3427           while (curr != NULL_USE_OPERAND_P
3428                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3429                      || ! bitmap_set_bit (visited,
3430                                           SSA_NAME_VERSION
3431                                             (USE_FROM_PTR (curr)))))
3432             curr = op_iter_next_use (&curri);
3433           if (curr == NULL_USE_OPERAND_P)
3434             goto pop;
3435         }
3436     }
3437   while (1);
3438   if (dump_file && (dump_flags & TDF_DETAILS))
3439     {
3440       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3441       unsigned i;
3442       std::pair<ssa_op_iter, use_operand_p> *x;
3443       FOR_EACH_VEC_ELT (path, i, x)
3444         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3445       dump_printf (MSG_NOTE, "\n");
3446     }
3447
3448   /* Check whether the reduction path detected is valid.  */
3449   bool fail = path.length () == 0;
3450   bool neg = false;
3451   int sign = -1;
3452   *code = ERROR_MARK;
3453   for (unsigned i = 1; i < path.length (); ++i)
3454     {
3455       gimple *use_stmt = USE_STMT (path[i].second);
3456       gimple_match_op op;
3457       if (!gimple_extract_op (use_stmt, &op))
3458         {
3459           fail = true;
3460           break;
3461         }
3462       unsigned int opi = op.num_ops;
3463       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3464         {
3465           /* The following make sure we can compute the operand index
3466              easily plus it mostly disallows chaining via COND_EXPR condition
3467              operands.  */
3468           for (opi = 0; opi < op.num_ops; ++opi)
3469             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3470               break;
3471         }
3472       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3473         {
3474           for (opi = 0; opi < op.num_ops; ++opi)
3475             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3476               break;
3477         }
3478       if (opi == op.num_ops)
3479         {
3480           fail = true;
3481           break;
3482         }
3483       op.code = canonicalize_code (op.code, op.type);
3484       if (op.code == MINUS_EXPR)
3485         {
3486           op.code = PLUS_EXPR;
3487           /* Track whether we negate the reduction value each iteration.  */
3488           if (op.ops[1] == op.ops[opi])
3489             neg = ! neg;
3490         }
3491       if (CONVERT_EXPR_CODE_P (op.code)
3492           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3493         ;
3494       else if (*code == ERROR_MARK)
3495         {
3496           *code = op.code;
3497           sign = TYPE_SIGN (op.type);
3498         }
3499       else if (op.code != *code)
3500         {
3501           fail = true;
3502           break;
3503         }
3504       else if ((op.code == MIN_EXPR
3505                 || op.code == MAX_EXPR)
3506                && sign != TYPE_SIGN (op.type))
3507         {
3508           fail = true;
3509           break;
3510         }
3511       /* Check there's only a single stmt the op is used on.  For the
3512          not value-changing tail and the last stmt allow out-of-loop uses.
3513          ???  We could relax this and handle arbitrary live stmts by
3514          forcing a scalar epilogue for example.  */
3515       imm_use_iterator imm_iter;
3516       gimple *op_use_stmt;
3517       unsigned cnt = 0;
3518       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3519         if (!is_gimple_debug (op_use_stmt)
3520             && (*code != ERROR_MARK
3521                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3522           {
3523             /* We want to allow x + x but not x < 1 ? x : 2.  */
3524             if (is_gimple_assign (op_use_stmt)
3525                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3526               {
3527                 use_operand_p use_p;
3528                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3529                   cnt++;
3530               }
3531             else
3532               cnt++;
3533           }
3534       if (cnt != 1)
3535         {
3536           fail = true;
3537           break;
3538         }
3539     }
3540   return ! fail && ! neg && *code != ERROR_MARK;
3541 }
3542
3543 bool
3544 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3545                       tree loop_arg, enum tree_code code)
3546 {
3547   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3548   code_helper code_;
3549   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3550           && code_ == code);
3551 }
3552
3553
3554
3555 /* Function vect_is_simple_reduction
3556
3557    (1) Detect a cross-iteration def-use cycle that represents a simple
3558    reduction computation.  We look for the following pattern:
3559
3560    loop_header:
3561      a1 = phi < a0, a2 >
3562      a3 = ...
3563      a2 = operation (a3, a1)
3564
3565    or
3566
3567    a3 = ...
3568    loop_header:
3569      a1 = phi < a0, a2 >
3570      a2 = operation (a3, a1)
3571
3572    such that:
3573    1. operation is commutative and associative and it is safe to
3574       change the order of the computation
3575    2. no uses for a2 in the loop (a2 is used out of the loop)
3576    3. no uses of a1 in the loop besides the reduction operation
3577    4. no uses of a1 outside the loop.
3578
3579    Conditions 1,4 are tested here.
3580    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3581
3582    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3583    nested cycles.
3584
3585    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3586    reductions:
3587
3588      a1 = phi < a0, a2 >
3589      inner loop (def of a3)
3590      a2 = phi < a3 >
3591
3592    (4) Detect condition expressions, ie:
3593      for (int i = 0; i < N; i++)
3594        if (a[i] < val)
3595         ret_val = a[i];
3596
3597 */
3598
3599 static stmt_vec_info
3600 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3601                           bool *double_reduc, bool *reduc_chain_p)
3602 {
3603   gphi *phi = as_a <gphi *> (phi_info->stmt);
3604   gimple *phi_use_stmt = NULL;
3605   imm_use_iterator imm_iter;
3606   use_operand_p use_p;
3607
3608   *double_reduc = false;
3609   *reduc_chain_p = false;
3610   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3611
3612   tree phi_name = PHI_RESULT (phi);
3613   /* ???  If there are no uses of the PHI result the inner loop reduction
3614      won't be detected as possibly double-reduction by vectorizable_reduction
3615      because that tries to walk the PHI arg from the preheader edge which
3616      can be constant.  See PR60382.  */
3617   if (has_zero_uses (phi_name))
3618     return NULL;
3619   class loop *loop = (gimple_bb (phi))->loop_father;
3620   unsigned nphi_def_loop_uses = 0;
3621   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3622     {
3623       gimple *use_stmt = USE_STMT (use_p);
3624       if (is_gimple_debug (use_stmt))
3625         continue;
3626
3627       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3628         {
3629           if (dump_enabled_p ())
3630             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3631                              "intermediate value used outside loop.\n");
3632
3633           return NULL;
3634         }
3635
3636       nphi_def_loop_uses++;
3637       phi_use_stmt = use_stmt;
3638     }
3639
3640   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3641   if (TREE_CODE (latch_def) != SSA_NAME)
3642     {
3643       if (dump_enabled_p ())
3644         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3645                          "reduction: not ssa_name: %T\n", latch_def);
3646       return NULL;
3647     }
3648
3649   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3650   if (!def_stmt_info
3651       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3652     return NULL;
3653
3654   bool nested_in_vect_loop
3655     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3656   unsigned nlatch_def_loop_uses = 0;
3657   auto_vec<gphi *, 3> lcphis;
3658   bool inner_loop_of_double_reduc = false;
3659   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3660     {
3661       gimple *use_stmt = USE_STMT (use_p);
3662       if (is_gimple_debug (use_stmt))
3663         continue;
3664       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3665         nlatch_def_loop_uses++;
3666       else
3667         {
3668           /* We can have more than one loop-closed PHI.  */
3669           lcphis.safe_push (as_a <gphi *> (use_stmt));
3670           if (nested_in_vect_loop
3671               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3672                   == vect_double_reduction_def))
3673             inner_loop_of_double_reduc = true;
3674         }
3675     }
3676
3677   /* If we are vectorizing an inner reduction we are executing that
3678      in the original order only in case we are not dealing with a
3679      double reduction.  */
3680   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3681     {
3682       if (dump_enabled_p ())
3683         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3684                         "detected nested cycle: ");
3685       return def_stmt_info;
3686     }
3687
3688   /* When the inner loop of a double reduction ends up with more than
3689      one loop-closed PHI we have failed to classify alternate such
3690      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3691   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3692     {
3693       if (dump_enabled_p ())
3694         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3695                          "unhandle double reduction\n");
3696       return NULL;
3697     }
3698
3699   /* If this isn't a nested cycle or if the nested cycle reduction value
3700      is used ouside of the inner loop we cannot handle uses of the reduction
3701      value.  */
3702   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3703     {
3704       if (dump_enabled_p ())
3705         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706                          "reduction used in loop.\n");
3707       return NULL;
3708     }
3709
3710   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3711      defined in the inner loop.  */
3712   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3713     {
3714       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3715       if (gimple_phi_num_args (def_stmt) != 1
3716           || TREE_CODE (op1) != SSA_NAME)
3717         {
3718           if (dump_enabled_p ())
3719             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3720                              "unsupported phi node definition.\n");
3721
3722           return NULL;
3723         }
3724
3725       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3726       if (gimple_bb (def1)
3727           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3728           && loop->inner
3729           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3730           && (is_gimple_assign (def1) || is_gimple_call (def1))
3731           && is_a <gphi *> (phi_use_stmt)
3732           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3733         {
3734           if (dump_enabled_p ())
3735             report_vect_op (MSG_NOTE, def_stmt,
3736                             "detected double reduction: ");
3737
3738           *double_reduc = true;
3739           return def_stmt_info;
3740         }
3741
3742       return NULL;
3743     }
3744
3745   /* Look for the expression computing latch_def from then loop PHI result.  */
3746   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3747   code_helper code;
3748   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3749                             path))
3750     {
3751       STMT_VINFO_REDUC_CODE (phi_info) = code;
3752       if (code == COND_EXPR && !nested_in_vect_loop)
3753         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3754
3755       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3756          reduction chain for which the additional restriction is that
3757          all operations in the chain are the same.  */
3758       auto_vec<stmt_vec_info, 8> reduc_chain;
3759       unsigned i;
3760       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3761       for (i = path.length () - 1; i >= 1; --i)
3762         {
3763           gimple *stmt = USE_STMT (path[i].second);
3764           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3765           gimple_match_op op;
3766           if (!gimple_extract_op (stmt, &op))
3767             gcc_unreachable ();
3768           if (gassign *assign = dyn_cast<gassign *> (stmt))
3769             STMT_VINFO_REDUC_IDX (stmt_info)
3770               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3771           else
3772             {
3773               gcall *call = as_a<gcall *> (stmt);
3774               STMT_VINFO_REDUC_IDX (stmt_info)
3775                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3776             }
3777           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3778                                      && (i == 1 || i == path.length () - 1));
3779           if ((op.code != code && !leading_conversion)
3780               /* We can only handle the final value in epilogue
3781                  generation for reduction chains.  */
3782               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3783             is_slp_reduc = false;
3784           /* For reduction chains we support a trailing/leading
3785              conversions.  We do not store those in the actual chain.  */
3786           if (leading_conversion)
3787             continue;
3788           reduc_chain.safe_push (stmt_info);
3789         }
3790       if (is_slp_reduc && reduc_chain.length () > 1)
3791         {
3792           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3793             {
3794               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3795               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3796             }
3797           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3798           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3799
3800           /* Save the chain for further analysis in SLP detection.  */
3801           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3802           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3803
3804           *reduc_chain_p = true;
3805           if (dump_enabled_p ())
3806             dump_printf_loc (MSG_NOTE, vect_location,
3807                             "reduction: detected reduction chain\n");
3808         }
3809       else if (dump_enabled_p ())
3810         dump_printf_loc (MSG_NOTE, vect_location,
3811                          "reduction: detected reduction\n");
3812
3813       return def_stmt_info;
3814     }
3815
3816   if (dump_enabled_p ())
3817     dump_printf_loc (MSG_NOTE, vect_location,
3818                      "reduction: unknown pattern\n");
3819
3820   return NULL;
3821 }
3822
3823 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3824    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3825    or -1 if not known.  */
3826
3827 static int
3828 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3829 {
3830   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3831   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3832     {
3833       if (dump_enabled_p ())
3834         dump_printf_loc (MSG_NOTE, vect_location,
3835                          "cost model: epilogue peel iters set to vf/2 "
3836                          "because loop iterations are unknown .\n");
3837       return assumed_vf / 2;
3838     }
3839   else
3840     {
3841       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3842       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3843       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3844       /* If we need to peel for gaps, but no peeling is required, we have to
3845          peel VF iterations.  */
3846       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3847         peel_iters_epilogue = assumed_vf;
3848       return peel_iters_epilogue;
3849     }
3850 }
3851
3852 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3853 int
3854 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3855                              int *peel_iters_epilogue,
3856                              stmt_vector_for_cost *scalar_cost_vec,
3857                              stmt_vector_for_cost *prologue_cost_vec,
3858                              stmt_vector_for_cost *epilogue_cost_vec)
3859 {
3860   int retval = 0;
3861
3862   *peel_iters_epilogue
3863     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3864
3865   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3866     {
3867       /* If peeled iterations are known but number of scalar loop
3868          iterations are unknown, count a taken branch per peeled loop.  */
3869       if (peel_iters_prologue > 0)
3870         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3871                                    vect_prologue);
3872       if (*peel_iters_epilogue > 0)
3873         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3874                                     vect_epilogue);
3875     }
3876
3877   stmt_info_for_cost *si;
3878   int j;
3879   if (peel_iters_prologue)
3880     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3881       retval += record_stmt_cost (prologue_cost_vec,
3882                                   si->count * peel_iters_prologue,
3883                                   si->kind, si->stmt_info, si->misalign,
3884                                   vect_prologue);
3885   if (*peel_iters_epilogue)
3886     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3887       retval += record_stmt_cost (epilogue_cost_vec,
3888                                   si->count * *peel_iters_epilogue,
3889                                   si->kind, si->stmt_info, si->misalign,
3890                                   vect_epilogue);
3891
3892   return retval;
3893 }
3894
3895 /* Function vect_estimate_min_profitable_iters
3896
3897    Return the number of iterations required for the vector version of the
3898    loop to be profitable relative to the cost of the scalar version of the
3899    loop.
3900
3901    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3902    of iterations for vectorization.  -1 value means loop vectorization
3903    is not profitable.  This returned value may be used for dynamic
3904    profitability check.
3905
3906    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3907    for static check against estimated number of iterations.  */
3908
3909 static void
3910 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3911                                     int *ret_min_profitable_niters,
3912                                     int *ret_min_profitable_estimate,
3913                                     unsigned *suggested_unroll_factor)
3914 {
3915   int min_profitable_iters;
3916   int min_profitable_estimate;
3917   int peel_iters_prologue;
3918   int peel_iters_epilogue;
3919   unsigned vec_inside_cost = 0;
3920   int vec_outside_cost = 0;
3921   unsigned vec_prologue_cost = 0;
3922   unsigned vec_epilogue_cost = 0;
3923   int scalar_single_iter_cost = 0;
3924   int scalar_outside_cost = 0;
3925   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3926   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3927   vector_costs *target_cost_data = loop_vinfo->vector_costs;
3928
3929   /* Cost model disabled.  */
3930   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3931     {
3932       if (dump_enabled_p ())
3933         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3934       *ret_min_profitable_niters = 0;
3935       *ret_min_profitable_estimate = 0;
3936       return;
3937     }
3938
3939   /* Requires loop versioning tests to handle misalignment.  */
3940   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3941     {
3942       /*  FIXME: Make cost depend on complexity of individual check.  */
3943       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3944       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3945       if (dump_enabled_p ())
3946         dump_printf (MSG_NOTE,
3947                      "cost model: Adding cost of checks for loop "
3948                      "versioning to treat misalignment.\n");
3949     }
3950
3951   /* Requires loop versioning with alias checks.  */
3952   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3953     {
3954       /*  FIXME: Make cost depend on complexity of individual check.  */
3955       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3956       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3957       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3958       if (len)
3959         /* Count LEN - 1 ANDs and LEN comparisons.  */
3960         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3961                               scalar_stmt, vect_prologue);
3962       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3963       if (len)
3964         {
3965           /* Count LEN - 1 ANDs and LEN comparisons.  */
3966           unsigned int nstmts = len * 2 - 1;
3967           /* +1 for each bias that needs adding.  */
3968           for (unsigned int i = 0; i < len; ++i)
3969             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3970               nstmts += 1;
3971           (void) add_stmt_cost (target_cost_data, nstmts,
3972                                 scalar_stmt, vect_prologue);
3973         }
3974       if (dump_enabled_p ())
3975         dump_printf (MSG_NOTE,
3976                      "cost model: Adding cost of checks for loop "
3977                      "versioning aliasing.\n");
3978     }
3979
3980   /* Requires loop versioning with niter checks.  */
3981   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3982     {
3983       /*  FIXME: Make cost depend on complexity of individual check.  */
3984       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3985                             NULL, NULL, NULL_TREE, 0, vect_prologue);
3986       if (dump_enabled_p ())
3987         dump_printf (MSG_NOTE,
3988                      "cost model: Adding cost of checks for loop "
3989                      "versioning niters.\n");
3990     }
3991
3992   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3993     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3994                           vect_prologue);
3995
3996   /* Count statements in scalar loop.  Using this as scalar cost for a single
3997      iteration for now.
3998
3999      TODO: Add outer loop support.
4000
4001      TODO: Consider assigning different costs to different scalar
4002      statements.  */
4003
4004   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4005
4006   /* Add additional cost for the peeled instructions in prologue and epilogue
4007      loop.  (For fully-masked loops there will be no peeling.)
4008
4009      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4010      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4011
4012      TODO: Build an expression that represents peel_iters for prologue and
4013      epilogue to be used in a run-time test.  */
4014
4015   bool prologue_need_br_taken_cost = false;
4016   bool prologue_need_br_not_taken_cost = false;
4017
4018   /* Calculate peel_iters_prologue.  */
4019   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4020     peel_iters_prologue = 0;
4021   else if (npeel < 0)
4022     {
4023       peel_iters_prologue = assumed_vf / 2;
4024       if (dump_enabled_p ())
4025         dump_printf (MSG_NOTE, "cost model: "
4026                      "prologue peel iters set to vf/2.\n");
4027
4028       /* If peeled iterations are unknown, count a taken branch and a not taken
4029          branch per peeled loop.  Even if scalar loop iterations are known,
4030          vector iterations are not known since peeled prologue iterations are
4031          not known.  Hence guards remain the same.  */
4032       prologue_need_br_taken_cost = true;
4033       prologue_need_br_not_taken_cost = true;
4034     }
4035   else
4036     {
4037       peel_iters_prologue = npeel;
4038       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4039         /* If peeled iterations are known but number of scalar loop
4040            iterations are unknown, count a taken branch per peeled loop.  */
4041         prologue_need_br_taken_cost = true;
4042     }
4043
4044   bool epilogue_need_br_taken_cost = false;
4045   bool epilogue_need_br_not_taken_cost = false;
4046
4047   /* Calculate peel_iters_epilogue.  */
4048   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4049     /* We need to peel exactly one iteration for gaps.  */
4050     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4051   else if (npeel < 0)
4052     {
4053       /* If peeling for alignment is unknown, loop bound of main loop
4054          becomes unknown.  */
4055       peel_iters_epilogue = assumed_vf / 2;
4056       if (dump_enabled_p ())
4057         dump_printf (MSG_NOTE, "cost model: "
4058                      "epilogue peel iters set to vf/2 because "
4059                      "peeling for alignment is unknown.\n");
4060
4061       /* See the same reason above in peel_iters_prologue calculation.  */
4062       epilogue_need_br_taken_cost = true;
4063       epilogue_need_br_not_taken_cost = true;
4064     }
4065   else
4066     {
4067       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4068       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4069         /* If peeled iterations are known but number of scalar loop
4070            iterations are unknown, count a taken branch per peeled loop.  */
4071         epilogue_need_br_taken_cost = true;
4072     }
4073
4074   stmt_info_for_cost *si;
4075   int j;
4076   /* Add costs associated with peel_iters_prologue.  */
4077   if (peel_iters_prologue)
4078     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4079       {
4080         (void) add_stmt_cost (target_cost_data,
4081                               si->count * peel_iters_prologue, si->kind,
4082                               si->stmt_info, si->node, si->vectype,
4083                               si->misalign, vect_prologue);
4084       }
4085
4086   /* Add costs associated with peel_iters_epilogue.  */
4087   if (peel_iters_epilogue)
4088     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4089       {
4090         (void) add_stmt_cost (target_cost_data,
4091                               si->count * peel_iters_epilogue, si->kind,
4092                               si->stmt_info, si->node, si->vectype,
4093                               si->misalign, vect_epilogue);
4094       }
4095
4096   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4097
4098   if (prologue_need_br_taken_cost)
4099     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4100                           vect_prologue);
4101
4102   if (prologue_need_br_not_taken_cost)
4103     (void) add_stmt_cost (target_cost_data, 1,
4104                           cond_branch_not_taken, vect_prologue);
4105
4106   if (epilogue_need_br_taken_cost)
4107     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4108                           vect_epilogue);
4109
4110   if (epilogue_need_br_not_taken_cost)
4111     (void) add_stmt_cost (target_cost_data, 1,
4112                           cond_branch_not_taken, vect_epilogue);
4113
4114   /* Take care of special costs for rgroup controls of partial vectors.  */
4115   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4116     {
4117       /* Calculate how many masks we need to generate.  */
4118       unsigned int num_masks = 0;
4119       rgroup_controls *rgm;
4120       unsigned int num_vectors_m1;
4121       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4122         if (rgm->type)
4123           num_masks += num_vectors_m1 + 1;
4124       gcc_assert (num_masks > 0);
4125
4126       /* In the worst case, we need to generate each mask in the prologue
4127          and in the loop body.  One of the loop body mask instructions
4128          replaces the comparison in the scalar loop, and since we don't
4129          count the scalar comparison against the scalar body, we shouldn't
4130          count that vector instruction against the vector body either.
4131
4132          Sometimes we can use unpacks instead of generating prologue
4133          masks and sometimes the prologue mask will fold to a constant,
4134          so the actual prologue cost might be smaller.  However, it's
4135          simpler and safer to use the worst-case cost; if this ends up
4136          being the tie-breaker between vectorizing or not, then it's
4137          probably better not to vectorize.  */
4138       (void) add_stmt_cost (target_cost_data, num_masks,
4139                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4140                             vect_prologue);
4141       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4142                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4143                             vect_body);
4144     }
4145   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4146     {
4147       /* Referring to the functions vect_set_loop_condition_partial_vectors
4148          and vect_set_loop_controls_directly, we need to generate each
4149          length in the prologue and in the loop body if required. Although
4150          there are some possible optimizations, we consider the worst case
4151          here.  */
4152
4153       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4154       signed char partial_load_store_bias
4155         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4156       bool need_iterate_p
4157         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4158            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4159
4160       /* Calculate how many statements to be added.  */
4161       unsigned int prologue_stmts = 0;
4162       unsigned int body_stmts = 0;
4163
4164       rgroup_controls *rgc;
4165       unsigned int num_vectors_m1;
4166       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4167         if (rgc->type)
4168           {
4169             /* May need one SHIFT for nitems_total computation.  */
4170             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4171             if (nitems != 1 && !niters_known_p)
4172               prologue_stmts += 1;
4173
4174             /* May need one MAX and one MINUS for wrap around.  */
4175             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4176               prologue_stmts += 2;
4177
4178             /* Need one MAX and one MINUS for each batch limit excepting for
4179                the 1st one.  */
4180             prologue_stmts += num_vectors_m1 * 2;
4181
4182             unsigned int num_vectors = num_vectors_m1 + 1;
4183
4184             /* Need to set up lengths in prologue, only one MIN required
4185                for each since start index is zero.  */
4186             prologue_stmts += num_vectors;
4187
4188             /* If we have a non-zero partial load bias, we need one PLUS
4189                to adjust the load length.  */
4190             if (partial_load_store_bias != 0)
4191               body_stmts += 1;
4192
4193             /* Each may need two MINs and one MINUS to update lengths in body
4194                for next iteration.  */
4195             if (need_iterate_p)
4196               body_stmts += 3 * num_vectors;
4197           }
4198
4199       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4200                             scalar_stmt, vect_prologue);
4201       (void) add_stmt_cost (target_cost_data, body_stmts,
4202                             scalar_stmt, vect_body);
4203     }
4204
4205   /* FORNOW: The scalar outside cost is incremented in one of the
4206      following ways:
4207
4208      1. The vectorizer checks for alignment and aliasing and generates
4209      a condition that allows dynamic vectorization.  A cost model
4210      check is ANDED with the versioning condition.  Hence scalar code
4211      path now has the added cost of the versioning check.
4212
4213        if (cost > th & versioning_check)
4214          jmp to vector code
4215
4216      Hence run-time scalar is incremented by not-taken branch cost.
4217
4218      2. The vectorizer then checks if a prologue is required.  If the
4219      cost model check was not done before during versioning, it has to
4220      be done before the prologue check.
4221
4222        if (cost <= th)
4223          prologue = scalar_iters
4224        if (prologue == 0)
4225          jmp to vector code
4226        else
4227          execute prologue
4228        if (prologue == num_iters)
4229          go to exit
4230
4231      Hence the run-time scalar cost is incremented by a taken branch,
4232      plus a not-taken branch, plus a taken branch cost.
4233
4234      3. The vectorizer then checks if an epilogue is required.  If the
4235      cost model check was not done before during prologue check, it
4236      has to be done with the epilogue check.
4237
4238        if (prologue == 0)
4239          jmp to vector code
4240        else
4241          execute prologue
4242        if (prologue == num_iters)
4243          go to exit
4244        vector code:
4245          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4246            jmp to epilogue
4247
4248      Hence the run-time scalar cost should be incremented by 2 taken
4249      branches.
4250
4251      TODO: The back end may reorder the BBS's differently and reverse
4252      conditions/branch directions.  Change the estimates below to
4253      something more reasonable.  */
4254
4255   /* If the number of iterations is known and we do not do versioning, we can
4256      decide whether to vectorize at compile time.  Hence the scalar version
4257      do not carry cost model guard costs.  */
4258   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4259       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4260     {
4261       /* Cost model check occurs at versioning.  */
4262       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4263         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4264       else
4265         {
4266           /* Cost model check occurs at prologue generation.  */
4267           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4268             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4269               + vect_get_stmt_cost (cond_branch_not_taken);
4270           /* Cost model check occurs at epilogue generation.  */
4271           else
4272             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4273         }
4274     }
4275
4276   /* Complete the target-specific cost calculations.  */
4277   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4278                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4279                suggested_unroll_factor);
4280
4281   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4282       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4283       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4284                     *suggested_unroll_factor,
4285                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4286     {
4287       if (dump_enabled_p ())
4288         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4289                          "can't unroll as unrolled vectorization factor larger"
4290                          " than maximum vectorization factor: %d\n",
4291                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4292       *suggested_unroll_factor = 1;
4293     }
4294
4295   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4296
4297   if (dump_enabled_p ())
4298     {
4299       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4300       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4301                    vec_inside_cost);
4302       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4303                    vec_prologue_cost);
4304       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4305                    vec_epilogue_cost);
4306       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4307                    scalar_single_iter_cost);
4308       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4309                    scalar_outside_cost);
4310       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4311                    vec_outside_cost);
4312       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4313                    peel_iters_prologue);
4314       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4315                    peel_iters_epilogue);
4316     }
4317
4318   /* Calculate number of iterations required to make the vector version
4319      profitable, relative to the loop bodies only.  The following condition
4320      must hold true:
4321      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4322      where
4323      SIC = scalar iteration cost, VIC = vector iteration cost,
4324      VOC = vector outside cost, VF = vectorization factor,
4325      NPEEL = prologue iterations + epilogue iterations,
4326      SOC = scalar outside cost for run time cost model check.  */
4327
4328   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4329                           - vec_inside_cost);
4330   if (saving_per_viter <= 0)
4331     {
4332       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4333         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4334                     "vectorization did not happen for a simd loop");
4335
4336       if (dump_enabled_p ())
4337         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4338                          "cost model: the vector iteration cost = %d "
4339                          "divided by the scalar iteration cost = %d "
4340                          "is greater or equal to the vectorization factor = %d"
4341                          ".\n",
4342                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4343       *ret_min_profitable_niters = -1;
4344       *ret_min_profitable_estimate = -1;
4345       return;
4346     }
4347
4348   /* ??? The "if" arm is written to handle all cases; see below for what
4349      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4350   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4351     {
4352       /* Rewriting the condition above in terms of the number of
4353          vector iterations (vniters) rather than the number of
4354          scalar iterations (niters) gives:
4355
4356          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4357
4358          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4359
4360          For integer N, X and Y when X > 0:
4361
4362          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4363       int outside_overhead = (vec_outside_cost
4364                               - scalar_single_iter_cost * peel_iters_prologue
4365                               - scalar_single_iter_cost * peel_iters_epilogue
4366                               - scalar_outside_cost);
4367       /* We're only interested in cases that require at least one
4368          vector iteration.  */
4369       int min_vec_niters = 1;
4370       if (outside_overhead > 0)
4371         min_vec_niters = outside_overhead / saving_per_viter + 1;
4372
4373       if (dump_enabled_p ())
4374         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4375                      min_vec_niters);
4376
4377       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4378         {
4379           /* Now that we know the minimum number of vector iterations,
4380              find the minimum niters for which the scalar cost is larger:
4381
4382              SIC * niters > VIC * vniters + VOC - SOC
4383
4384              We know that the minimum niters is no more than
4385              vniters * VF + NPEEL, but it might be (and often is) less
4386              than that if a partial vector iteration is cheaper than the
4387              equivalent scalar code.  */
4388           int threshold = (vec_inside_cost * min_vec_niters
4389                            + vec_outside_cost
4390                            - scalar_outside_cost);
4391           if (threshold <= 0)
4392             min_profitable_iters = 1;
4393           else
4394             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4395         }
4396       else
4397         /* Convert the number of vector iterations into a number of
4398            scalar iterations.  */
4399         min_profitable_iters = (min_vec_niters * assumed_vf
4400                                 + peel_iters_prologue
4401                                 + peel_iters_epilogue);
4402     }
4403   else
4404     {
4405       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4406                               * assumed_vf
4407                               - vec_inside_cost * peel_iters_prologue
4408                               - vec_inside_cost * peel_iters_epilogue);
4409       if (min_profitable_iters <= 0)
4410         min_profitable_iters = 0;
4411       else
4412         {
4413           min_profitable_iters /= saving_per_viter;
4414
4415           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4416               <= (((int) vec_inside_cost * min_profitable_iters)
4417                   + (((int) vec_outside_cost - scalar_outside_cost)
4418                      * assumed_vf)))
4419             min_profitable_iters++;
4420         }
4421     }
4422
4423   if (dump_enabled_p ())
4424     dump_printf (MSG_NOTE,
4425                  "  Calculated minimum iters for profitability: %d\n",
4426                  min_profitable_iters);
4427
4428   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4429       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4430     /* We want the vectorized loop to execute at least once.  */
4431     min_profitable_iters = assumed_vf + peel_iters_prologue;
4432   else if (min_profitable_iters < peel_iters_prologue)
4433     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4434        vectorized loop executes at least once.  */
4435     min_profitable_iters = peel_iters_prologue;
4436
4437   if (dump_enabled_p ())
4438     dump_printf_loc (MSG_NOTE, vect_location,
4439                      "  Runtime profitability threshold = %d\n",
4440                      min_profitable_iters);
4441
4442   *ret_min_profitable_niters = min_profitable_iters;
4443
4444   /* Calculate number of iterations required to make the vector version
4445      profitable, relative to the loop bodies only.
4446
4447      Non-vectorized variant is SIC * niters and it must win over vector
4448      variant on the expected loop trip count.  The following condition must hold true:
4449      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4450
4451   if (vec_outside_cost <= 0)
4452     min_profitable_estimate = 0;
4453   /* ??? This "else if" arm is written to handle all cases; see below for
4454      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4455   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4456     {
4457       /* This is a repeat of the code above, but with + SOC rather
4458          than - SOC.  */
4459       int outside_overhead = (vec_outside_cost
4460                               - scalar_single_iter_cost * peel_iters_prologue
4461                               - scalar_single_iter_cost * peel_iters_epilogue
4462                               + scalar_outside_cost);
4463       int min_vec_niters = 1;
4464       if (outside_overhead > 0)
4465         min_vec_niters = outside_overhead / saving_per_viter + 1;
4466
4467       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4468         {
4469           int threshold = (vec_inside_cost * min_vec_niters
4470                            + vec_outside_cost
4471                            + scalar_outside_cost);
4472           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4473         }
4474       else
4475         min_profitable_estimate = (min_vec_niters * assumed_vf
4476                                    + peel_iters_prologue
4477                                    + peel_iters_epilogue);
4478     }
4479   else
4480     {
4481       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4482                                  * assumed_vf
4483                                  - vec_inside_cost * peel_iters_prologue
4484                                  - vec_inside_cost * peel_iters_epilogue)
4485                                  / ((scalar_single_iter_cost * assumed_vf)
4486                                    - vec_inside_cost);
4487     }
4488   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4489   if (dump_enabled_p ())
4490     dump_printf_loc (MSG_NOTE, vect_location,
4491                      "  Static estimate profitability threshold = %d\n",
4492                      min_profitable_estimate);
4493
4494   *ret_min_profitable_estimate = min_profitable_estimate;
4495 }
4496
4497 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4498    vector elements (not bits) for a vector with NELT elements.  */
4499 static void
4500 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4501                               vec_perm_builder *sel)
4502 {
4503   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4504      by vec_perm_indices.  */
4505   sel->new_vector (nelt, 1, 3);
4506   for (unsigned int i = 0; i < 3; i++)
4507     sel->quick_push (i + offset);
4508 }
4509
4510 /* Checks whether the target supports whole-vector shifts for vectors of mode
4511    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4512    it supports vec_perm_const with masks for all necessary shift amounts.  */
4513 static bool
4514 have_whole_vector_shift (machine_mode mode)
4515 {
4516   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4517     return true;
4518
4519   /* Variable-length vectors should be handled via the optab.  */
4520   unsigned int nelt;
4521   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4522     return false;
4523
4524   vec_perm_builder sel;
4525   vec_perm_indices indices;
4526   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4527     {
4528       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4529       indices.new_vector (sel, 2, nelt);
4530       if (!can_vec_perm_const_p (mode, indices, false))
4531         return false;
4532     }
4533   return true;
4534 }
4535
4536 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4537    functions. Design better to avoid maintenance issues.  */
4538
4539 /* Function vect_model_reduction_cost.
4540
4541    Models cost for a reduction operation, including the vector ops
4542    generated within the strip-mine loop in some cases, the initial
4543    definition before the loop, and the epilogue code that must be generated.  */
4544
4545 static void
4546 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4547                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4548                            vect_reduction_type reduction_type,
4549                            int ncopies, stmt_vector_for_cost *cost_vec)
4550 {
4551   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4552   tree vectype;
4553   machine_mode mode;
4554   class loop *loop = NULL;
4555
4556   if (loop_vinfo)
4557     loop = LOOP_VINFO_LOOP (loop_vinfo);
4558
4559   /* Condition reductions generate two reductions in the loop.  */
4560   if (reduction_type == COND_REDUCTION)
4561     ncopies *= 2;
4562
4563   vectype = STMT_VINFO_VECTYPE (stmt_info);
4564   mode = TYPE_MODE (vectype);
4565   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4566
4567   gimple_match_op op;
4568   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4569     gcc_unreachable ();
4570
4571   if (reduction_type == EXTRACT_LAST_REDUCTION)
4572     /* No extra instructions are needed in the prologue.  The loop body
4573        operations are costed in vectorizable_condition.  */
4574     inside_cost = 0;
4575   else if (reduction_type == FOLD_LEFT_REDUCTION)
4576     {
4577       /* No extra instructions needed in the prologue.  */
4578       prologue_cost = 0;
4579
4580       if (reduc_fn != IFN_LAST)
4581         /* Count one reduction-like operation per vector.  */
4582         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4583                                         stmt_info, 0, vect_body);
4584       else
4585         {
4586           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4587           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4588           inside_cost = record_stmt_cost (cost_vec, nelements,
4589                                           vec_to_scalar, stmt_info, 0,
4590                                           vect_body);
4591           inside_cost += record_stmt_cost (cost_vec, nelements,
4592                                            scalar_stmt, stmt_info, 0,
4593                                            vect_body);
4594         }
4595     }
4596   else
4597     {
4598       /* Add in cost for initial definition.
4599          For cond reduction we have four vectors: initial index, step,
4600          initial result of the data reduction, initial value of the index
4601          reduction.  */
4602       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4603       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4604                                          scalar_to_vec, stmt_info, 0,
4605                                          vect_prologue);
4606     }
4607
4608   /* Determine cost of epilogue code.
4609
4610      We have a reduction operator that will reduce the vector in one statement.
4611      Also requires scalar extract.  */
4612
4613   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4614     {
4615       if (reduc_fn != IFN_LAST)
4616         {
4617           if (reduction_type == COND_REDUCTION)
4618             {
4619               /* An EQ stmt and an COND_EXPR stmt.  */
4620               epilogue_cost += record_stmt_cost (cost_vec, 2,
4621                                                  vector_stmt, stmt_info, 0,
4622                                                  vect_epilogue);
4623               /* Reduction of the max index and a reduction of the found
4624                  values.  */
4625               epilogue_cost += record_stmt_cost (cost_vec, 2,
4626                                                  vec_to_scalar, stmt_info, 0,
4627                                                  vect_epilogue);
4628               /* A broadcast of the max value.  */
4629               epilogue_cost += record_stmt_cost (cost_vec, 1,
4630                                                  scalar_to_vec, stmt_info, 0,
4631                                                  vect_epilogue);
4632             }
4633           else
4634             {
4635               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4636                                                  stmt_info, 0, vect_epilogue);
4637               epilogue_cost += record_stmt_cost (cost_vec, 1,
4638                                                  vec_to_scalar, stmt_info, 0,
4639                                                  vect_epilogue);
4640             }
4641         }
4642       else if (reduction_type == COND_REDUCTION)
4643         {
4644           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4645           /* Extraction of scalar elements.  */
4646           epilogue_cost += record_stmt_cost (cost_vec,
4647                                              2 * estimated_nunits,
4648                                              vec_to_scalar, stmt_info, 0,
4649                                              vect_epilogue);
4650           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4651           epilogue_cost += record_stmt_cost (cost_vec,
4652                                              2 * estimated_nunits - 3,
4653                                              scalar_stmt, stmt_info, 0,
4654                                              vect_epilogue);
4655         }
4656       else if (reduction_type == EXTRACT_LAST_REDUCTION
4657                || reduction_type == FOLD_LEFT_REDUCTION)
4658         /* No extra instructions need in the epilogue.  */
4659         ;
4660       else
4661         {
4662           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4663           tree bitsize = TYPE_SIZE (op.type);
4664           int element_bitsize = tree_to_uhwi (bitsize);
4665           int nelements = vec_size_in_bits / element_bitsize;
4666
4667           if (op.code == COND_EXPR)
4668             op.code = MAX_EXPR;
4669
4670           /* We have a whole vector shift available.  */
4671           if (VECTOR_MODE_P (mode)
4672               && directly_supported_p (op.code, vectype)
4673               && have_whole_vector_shift (mode))
4674             {
4675               /* Final reduction via vector shifts and the reduction operator.
4676                  Also requires scalar extract.  */
4677               epilogue_cost += record_stmt_cost (cost_vec,
4678                                                  exact_log2 (nelements) * 2,
4679                                                  vector_stmt, stmt_info, 0,
4680                                                  vect_epilogue);
4681               epilogue_cost += record_stmt_cost (cost_vec, 1,
4682                                                  vec_to_scalar, stmt_info, 0,
4683                                                  vect_epilogue);
4684             }
4685           else
4686             /* Use extracts and reduction op for final reduction.  For N
4687                elements, we have N extracts and N-1 reduction ops.  */
4688             epilogue_cost += record_stmt_cost (cost_vec,
4689                                                nelements + nelements - 1,
4690                                                vector_stmt, stmt_info, 0,
4691                                                vect_epilogue);
4692         }
4693     }
4694
4695   if (dump_enabled_p ())
4696     dump_printf (MSG_NOTE,
4697                  "vect_model_reduction_cost: inside_cost = %d, "
4698                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4699                  prologue_cost, epilogue_cost);
4700 }
4701
4702 /* SEQ is a sequence of instructions that initialize the reduction
4703    described by REDUC_INFO.  Emit them in the appropriate place.  */
4704
4705 static void
4706 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4707                                 stmt_vec_info reduc_info, gimple *seq)
4708 {
4709   if (reduc_info->reused_accumulator)
4710     {
4711       /* When reusing an accumulator from the main loop, we only need
4712          initialization instructions if the main loop can be skipped.
4713          In that case, emit the initialization instructions at the end
4714          of the guard block that does the skip.  */
4715       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4716       gcc_assert (skip_edge);
4717       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4718       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4719     }
4720   else
4721     {
4722       /* The normal case: emit the initialization instructions on the
4723          preheader edge.  */
4724       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4725       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4726     }
4727 }
4728
4729 /* Function get_initial_def_for_reduction
4730
4731    Input:
4732    REDUC_INFO - the info_for_reduction
4733    INIT_VAL - the initial value of the reduction variable
4734    NEUTRAL_OP - a value that has no effect on the reduction, as per
4735                 neutral_op_for_reduction
4736
4737    Output:
4738    Return a vector variable, initialized according to the operation that
4739         STMT_VINFO performs. This vector will be used as the initial value
4740         of the vector of partial results.
4741
4742    The value we need is a vector in which element 0 has value INIT_VAL
4743    and every other element has value NEUTRAL_OP.  */
4744
4745 static tree
4746 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4747                                stmt_vec_info reduc_info,
4748                                tree init_val, tree neutral_op)
4749 {
4750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4751   tree scalar_type = TREE_TYPE (init_val);
4752   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4753   tree init_def;
4754   gimple_seq stmts = NULL;
4755
4756   gcc_assert (vectype);
4757
4758   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4759               || SCALAR_FLOAT_TYPE_P (scalar_type));
4760
4761   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4762               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4763
4764   if (operand_equal_p (init_val, neutral_op))
4765     {
4766       /* If both elements are equal then the vector described above is
4767          just a splat.  */
4768       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4769       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4770     }
4771   else
4772     {
4773       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4774       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4775       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4776         {
4777           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4778              element 0.  */
4779           init_def = gimple_build_vector_from_val (&stmts, vectype,
4780                                                    neutral_op);
4781           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4782                                    vectype, init_def, init_val);
4783         }
4784       else
4785         {
4786           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4787           tree_vector_builder elts (vectype, 1, 2);
4788           elts.quick_push (init_val);
4789           elts.quick_push (neutral_op);
4790           init_def = gimple_build_vector (&stmts, &elts);
4791         }
4792     }
4793
4794   if (stmts)
4795     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4796   return init_def;
4797 }
4798
4799 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4800    which performs a reduction involving GROUP_SIZE scalar statements.
4801    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4802    is nonnull, introducing extra elements of that value will not change the
4803    result.  */
4804
4805 static void
4806 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4807                                 stmt_vec_info reduc_info,
4808                                 vec<tree> *vec_oprnds,
4809                                 unsigned int number_of_vectors,
4810                                 unsigned int group_size, tree neutral_op)
4811 {
4812   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4813   unsigned HOST_WIDE_INT nunits;
4814   unsigned j, number_of_places_left_in_vector;
4815   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4816   unsigned int i;
4817
4818   gcc_assert (group_size == initial_values.length () || neutral_op);
4819
4820   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4821      created vectors. It is greater than 1 if unrolling is performed.
4822
4823      For example, we have two scalar operands, s1 and s2 (e.g., group of
4824      strided accesses of size two), while NUNITS is four (i.e., four scalars
4825      of this type can be packed in a vector).  The output vector will contain
4826      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4827      will be 2).
4828
4829      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4830      vectors containing the operands.
4831
4832      For example, NUNITS is four as before, and the group size is 8
4833      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4834      {s5, s6, s7, s8}.  */
4835
4836   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4837     nunits = group_size;
4838
4839   number_of_places_left_in_vector = nunits;
4840   bool constant_p = true;
4841   tree_vector_builder elts (vector_type, nunits, 1);
4842   elts.quick_grow (nunits);
4843   gimple_seq ctor_seq = NULL;
4844   for (j = 0; j < nunits * number_of_vectors; ++j)
4845     {
4846       tree op;
4847       i = j % group_size;
4848
4849       /* Get the def before the loop.  In reduction chain we have only
4850          one initial value.  Else we have as many as PHIs in the group.  */
4851       if (i >= initial_values.length () || (j > i && neutral_op))
4852         op = neutral_op;
4853       else
4854         op = initial_values[i];
4855
4856       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4857       number_of_places_left_in_vector--;
4858       elts[nunits - number_of_places_left_in_vector - 1] = op;
4859       if (!CONSTANT_CLASS_P (op))
4860         constant_p = false;
4861
4862       if (number_of_places_left_in_vector == 0)
4863         {
4864           tree init;
4865           if (constant_p && !neutral_op
4866               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4867               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4868             /* Build the vector directly from ELTS.  */
4869             init = gimple_build_vector (&ctor_seq, &elts);
4870           else if (neutral_op)
4871             {
4872               /* Build a vector of the neutral value and shift the
4873                  other elements into place.  */
4874               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4875                                                    neutral_op);
4876               int k = nunits;
4877               while (k > 0 && elts[k - 1] == neutral_op)
4878                 k -= 1;
4879               while (k > 0)
4880                 {
4881                   k -= 1;
4882                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4883                                        vector_type, init, elts[k]);
4884                 }
4885             }
4886           else
4887             {
4888               /* First time round, duplicate ELTS to fill the
4889                  required number of vectors.  */
4890               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4891                                         elts, number_of_vectors, *vec_oprnds);
4892               break;
4893             }
4894           vec_oprnds->quick_push (init);
4895
4896           number_of_places_left_in_vector = nunits;
4897           elts.new_vector (vector_type, nunits, 1);
4898           elts.quick_grow (nunits);
4899           constant_p = true;
4900         }
4901     }
4902   if (ctor_seq != NULL)
4903     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4904 }
4905
4906 /* For a statement STMT_INFO taking part in a reduction operation return
4907    the stmt_vec_info the meta information is stored on.  */
4908
4909 stmt_vec_info
4910 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4911 {
4912   stmt_info = vect_orig_stmt (stmt_info);
4913   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4914   if (!is_a <gphi *> (stmt_info->stmt)
4915       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4916     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4917   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4918   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4919     {
4920       if (gimple_phi_num_args (phi) == 1)
4921         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4922     }
4923   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4924     {
4925       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4926       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4927         stmt_info = info;
4928     }
4929   return stmt_info;
4930 }
4931
4932 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4933    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
4934    return false.  */
4935
4936 static bool
4937 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4938                                 stmt_vec_info reduc_info)
4939 {
4940   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4941   if (!main_loop_vinfo)
4942     return false;
4943
4944   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4945     return false;
4946
4947   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4948   auto_vec<tree, 16> main_loop_results (num_phis);
4949   auto_vec<tree, 16> initial_values (num_phis);
4950   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4951     {
4952       /* The epilogue loop can be entered either from the main loop or
4953          from an earlier guard block.  */
4954       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4955       for (tree incoming_value : reduc_info->reduc_initial_values)
4956         {
4957           /* Look for:
4958
4959                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4960                                     INITIAL_VALUE(guard block)>.  */
4961           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4962
4963           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4964           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4965
4966           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4967           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4968
4969           main_loop_results.quick_push (from_main_loop);
4970           initial_values.quick_push (from_skip);
4971         }
4972     }
4973   else
4974     /* The main loop dominates the epilogue loop.  */
4975     main_loop_results.splice (reduc_info->reduc_initial_values);
4976
4977   /* See if the main loop has the kind of accumulator we need.  */
4978   vect_reusable_accumulator *accumulator
4979     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4980   if (!accumulator
4981       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4982       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4983                       accumulator->reduc_info->reduc_scalar_results.begin ()))
4984     return false;
4985
4986   /* Handle the case where we can reduce wider vectors to narrower ones.  */
4987   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4988   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4989   unsigned HOST_WIDE_INT m;
4990   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4991                             TYPE_VECTOR_SUBPARTS (vectype), &m))
4992     return false;
4993   /* Check the intermediate vector types and operations are available.  */
4994   tree prev_vectype = old_vectype;
4995   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
4996   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4997     {
4998       intermediate_nunits = exact_div (intermediate_nunits, 2);
4999       tree intermediate_vectype = get_related_vectype_for_scalar_type
5000         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5001       if (!intermediate_vectype
5002           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5003                                     intermediate_vectype)
5004           || !can_vec_extract (TYPE_MODE (prev_vectype),
5005                                TYPE_MODE (intermediate_vectype)))
5006         return false;
5007       prev_vectype = intermediate_vectype;
5008     }
5009
5010   /* Non-SLP reductions might apply an adjustment after the reduction
5011      operation, in order to simplify the initialization of the accumulator.
5012      If the epilogue loop carries on from where the main loop left off,
5013      it should apply the same adjustment to the final reduction result.
5014
5015      If the epilogue loop can also be entered directly (rather than via
5016      the main loop), we need to be able to handle that case in the same way,
5017      with the same adjustment.  (In principle we could add a PHI node
5018      to select the correct adjustment, but in practice that shouldn't be
5019      necessary.)  */
5020   tree main_adjustment
5021     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5022   if (loop_vinfo->main_loop_edge && main_adjustment)
5023     {
5024       gcc_assert (num_phis == 1);
5025       tree initial_value = initial_values[0];
5026       /* Check that we can use INITIAL_VALUE as the adjustment and
5027          initialize the accumulator with a neutral value instead.  */
5028       if (!operand_equal_p (initial_value, main_adjustment))
5029         return false;
5030       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5031       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5032                                                     code, initial_value);
5033     }
5034   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5035   reduc_info->reduc_initial_values.truncate (0);
5036   reduc_info->reduc_initial_values.splice (initial_values);
5037   reduc_info->reused_accumulator = accumulator;
5038   return true;
5039 }
5040
5041 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5042    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5043
5044 static tree
5045 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5046                             gimple_seq *seq)
5047 {
5048   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5049   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5050   tree stype = TREE_TYPE (vectype);
5051   tree new_temp = vec_def;
5052   while (nunits > nunits1)
5053     {
5054       nunits /= 2;
5055       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5056                                                            stype, nunits);
5057       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5058
5059       /* The target has to make sure we support lowpart/highpart
5060          extraction, either via direct vector extract or through
5061          an integer mode punning.  */
5062       tree dst1, dst2;
5063       gimple *epilog_stmt;
5064       if (convert_optab_handler (vec_extract_optab,
5065                                  TYPE_MODE (TREE_TYPE (new_temp)),
5066                                  TYPE_MODE (vectype1))
5067           != CODE_FOR_nothing)
5068         {
5069           /* Extract sub-vectors directly once vec_extract becomes
5070              a conversion optab.  */
5071           dst1 = make_ssa_name (vectype1);
5072           epilog_stmt
5073               = gimple_build_assign (dst1, BIT_FIELD_REF,
5074                                      build3 (BIT_FIELD_REF, vectype1,
5075                                              new_temp, TYPE_SIZE (vectype1),
5076                                              bitsize_int (0)));
5077           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5078           dst2 =  make_ssa_name (vectype1);
5079           epilog_stmt
5080               = gimple_build_assign (dst2, BIT_FIELD_REF,
5081                                      build3 (BIT_FIELD_REF, vectype1,
5082                                              new_temp, TYPE_SIZE (vectype1),
5083                                              bitsize_int (bitsize)));
5084           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5085         }
5086       else
5087         {
5088           /* Extract via punning to appropriately sized integer mode
5089              vector.  */
5090           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5091           tree etype = build_vector_type (eltype, 2);
5092           gcc_assert (convert_optab_handler (vec_extract_optab,
5093                                              TYPE_MODE (etype),
5094                                              TYPE_MODE (eltype))
5095                       != CODE_FOR_nothing);
5096           tree tem = make_ssa_name (etype);
5097           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5098                                              build1 (VIEW_CONVERT_EXPR,
5099                                                      etype, new_temp));
5100           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5101           new_temp = tem;
5102           tem = make_ssa_name (eltype);
5103           epilog_stmt
5104               = gimple_build_assign (tem, BIT_FIELD_REF,
5105                                      build3 (BIT_FIELD_REF, eltype,
5106                                              new_temp, TYPE_SIZE (eltype),
5107                                              bitsize_int (0)));
5108           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5109           dst1 = make_ssa_name (vectype1);
5110           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5111                                              build1 (VIEW_CONVERT_EXPR,
5112                                                      vectype1, tem));
5113           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5114           tem = make_ssa_name (eltype);
5115           epilog_stmt
5116               = gimple_build_assign (tem, BIT_FIELD_REF,
5117                                      build3 (BIT_FIELD_REF, eltype,
5118                                              new_temp, TYPE_SIZE (eltype),
5119                                              bitsize_int (bitsize)));
5120           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5121           dst2 =  make_ssa_name (vectype1);
5122           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5123                                              build1 (VIEW_CONVERT_EXPR,
5124                                                      vectype1, tem));
5125           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5126         }
5127
5128       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5129     }
5130
5131   return new_temp;
5132 }
5133
5134 /* Function vect_create_epilog_for_reduction
5135
5136    Create code at the loop-epilog to finalize the result of a reduction
5137    computation.
5138
5139    STMT_INFO is the scalar reduction stmt that is being vectorized.
5140    SLP_NODE is an SLP node containing a group of reduction statements. The
5141      first one in this group is STMT_INFO.
5142    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5143    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5144      (counting from 0)
5145
5146    This function:
5147    1. Completes the reduction def-use cycles.
5148    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5149       by calling the function specified by REDUC_FN if available, or by
5150       other means (whole-vector shifts or a scalar loop).
5151       The function also creates a new phi node at the loop exit to preserve
5152       loop-closed form, as illustrated below.
5153
5154      The flow at the entry to this function:
5155
5156         loop:
5157           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5158           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5159           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5160         loop_exit:
5161           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5162           use <s_out0>
5163           use <s_out0>
5164
5165      The above is transformed by this function into:
5166
5167         loop:
5168           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5169           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5170           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5171         loop_exit:
5172           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5173           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5174           v_out2 = reduce <v_out1>
5175           s_out3 = extract_field <v_out2, 0>
5176           s_out4 = adjust_result <s_out3>
5177           use <s_out4>
5178           use <s_out4>
5179 */
5180
5181 static void
5182 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5183                                   stmt_vec_info stmt_info,
5184                                   slp_tree slp_node,
5185                                   slp_instance slp_node_instance)
5186 {
5187   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5188   gcc_assert (reduc_info->is_reduc_info);
5189   /* For double reductions we need to get at the inner loop reduction
5190      stmt which has the meta info attached.  Our stmt_info is that of the
5191      loop-closed PHI of the inner loop which we remember as
5192      def for the reduction PHI generation.  */
5193   bool double_reduc = false;
5194   stmt_vec_info rdef_info = stmt_info;
5195   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5196     {
5197       gcc_assert (!slp_node);
5198       double_reduc = true;
5199       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5200                                             (stmt_info->stmt, 0));
5201       stmt_info = vect_stmt_to_vectorize (stmt_info);
5202     }
5203   gphi *reduc_def_stmt
5204     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5205   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5206   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5207   tree vectype;
5208   machine_mode mode;
5209   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5210   basic_block exit_bb;
5211   tree scalar_dest;
5212   tree scalar_type;
5213   gimple *new_phi = NULL, *phi;
5214   gimple_stmt_iterator exit_gsi;
5215   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5216   gimple *epilog_stmt = NULL;
5217   gimple *exit_phi;
5218   tree bitsize;
5219   tree def;
5220   tree orig_name, scalar_result;
5221   imm_use_iterator imm_iter, phi_imm_iter;
5222   use_operand_p use_p, phi_use_p;
5223   gimple *use_stmt;
5224   auto_vec<tree> reduc_inputs;
5225   int j, i;
5226   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5227   unsigned int group_size = 1, k;
5228   auto_vec<gimple *> phis;
5229   /* SLP reduction without reduction chain, e.g.,
5230      # a1 = phi <a2, a0>
5231      # b1 = phi <b2, b0>
5232      a2 = operation (a1)
5233      b2 = operation (b1)  */
5234   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5235   bool direct_slp_reduc;
5236   tree induction_index = NULL_TREE;
5237
5238   if (slp_node)
5239     group_size = SLP_TREE_LANES (slp_node);
5240
5241   if (nested_in_vect_loop_p (loop, stmt_info))
5242     {
5243       outer_loop = loop;
5244       loop = loop->inner;
5245       gcc_assert (!slp_node && double_reduc);
5246     }
5247
5248   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5249   gcc_assert (vectype);
5250   mode = TYPE_MODE (vectype);
5251
5252   tree induc_val = NULL_TREE;
5253   tree adjustment_def = NULL;
5254   if (slp_node)
5255     ;
5256   else
5257     {
5258       /* Optimize: for induction condition reduction, if we can't use zero
5259          for induc_val, use initial_def.  */
5260       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5261         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5262       else if (double_reduc)
5263         ;
5264       else
5265         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5266     }
5267
5268   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5269   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5270   if (slp_reduc)
5271     /* All statements produce live-out values.  */
5272     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5273   else if (slp_node)
5274     {
5275       /* The last statement in the reduction chain produces the live-out
5276          value.  Note SLP optimization can shuffle scalar stmts to
5277          optimize permutations so we have to search for the last stmt.  */
5278       for (k = 0; k < group_size; ++k)
5279         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5280           {
5281             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5282             break;
5283           }
5284     }
5285
5286   unsigned vec_num;
5287   int ncopies;
5288   if (slp_node)
5289     {
5290       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5291       ncopies = 1;
5292     }
5293   else
5294     {
5295       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5296       vec_num = 1;
5297       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5298     }
5299
5300   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5301      which is updated with the current index of the loop for every match of
5302      the original loop's cond_expr (VEC_STMT).  This results in a vector
5303      containing the last time the condition passed for that vector lane.
5304      The first match will be a 1 to allow 0 to be used for non-matching
5305      indexes.  If there are no matches at all then the vector will be all
5306      zeroes.
5307
5308      PR92772: This algorithm is broken for architectures that support
5309      masked vectors, but do not provide fold_extract_last.  */
5310   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5311     {
5312       auto_vec<std::pair<tree, bool>, 2> ccompares;
5313       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5314       cond_info = vect_stmt_to_vectorize (cond_info);
5315       while (cond_info != reduc_info)
5316         {
5317           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5318             {
5319               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5320               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5321               ccompares.safe_push
5322                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5323                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5324             }
5325           cond_info
5326             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5327                                                  1 + STMT_VINFO_REDUC_IDX
5328                                                         (cond_info)));
5329           cond_info = vect_stmt_to_vectorize (cond_info);
5330         }
5331       gcc_assert (ccompares.length () != 0);
5332
5333       tree indx_before_incr, indx_after_incr;
5334       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5335       int scalar_precision
5336         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5337       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5338       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5339         (TYPE_MODE (vectype), cr_index_scalar_type,
5340          TYPE_VECTOR_SUBPARTS (vectype));
5341
5342       /* First we create a simple vector induction variable which starts
5343          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5344          vector size (STEP).  */
5345
5346       /* Create a {1,2,3,...} vector.  */
5347       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5348
5349       /* Create a vector of the step value.  */
5350       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5351       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5352
5353       /* Create an induction variable.  */
5354       gimple_stmt_iterator incr_gsi;
5355       bool insert_after;
5356       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5357       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5358                  insert_after, &indx_before_incr, &indx_after_incr);
5359
5360       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5361          filled with zeros (VEC_ZERO).  */
5362
5363       /* Create a vector of 0s.  */
5364       tree zero = build_zero_cst (cr_index_scalar_type);
5365       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5366
5367       /* Create a vector phi node.  */
5368       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5369       new_phi = create_phi_node (new_phi_tree, loop->header);
5370       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5371                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5372
5373       /* Now take the condition from the loops original cond_exprs
5374          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5375          every match uses values from the induction variable
5376          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5377          (NEW_PHI_TREE).
5378          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5379          the new cond_expr (INDEX_COND_EXPR).  */
5380       gimple_seq stmts = NULL;
5381       for (int i = ccompares.length () - 1; i != -1; --i)
5382         {
5383           tree ccompare = ccompares[i].first;
5384           if (ccompares[i].second)
5385             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5386                                          cr_index_vector_type,
5387                                          ccompare,
5388                                          indx_before_incr, new_phi_tree);
5389           else
5390             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5391                                          cr_index_vector_type,
5392                                          ccompare,
5393                                          new_phi_tree, indx_before_incr);
5394         }
5395       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5396
5397       /* Update the phi with the vec cond.  */
5398       induction_index = new_phi_tree;
5399       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5400                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5401     }
5402
5403   /* 2. Create epilog code.
5404         The reduction epilog code operates across the elements of the vector
5405         of partial results computed by the vectorized loop.
5406         The reduction epilog code consists of:
5407
5408         step 1: compute the scalar result in a vector (v_out2)
5409         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5410         step 3: adjust the scalar result (s_out3) if needed.
5411
5412         Step 1 can be accomplished using one the following three schemes:
5413           (scheme 1) using reduc_fn, if available.
5414           (scheme 2) using whole-vector shifts, if available.
5415           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5416                      combined.
5417
5418           The overall epilog code looks like this:
5419
5420           s_out0 = phi <s_loop>         # original EXIT_PHI
5421           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5422           v_out2 = reduce <v_out1>              # step 1
5423           s_out3 = extract_field <v_out2, 0>    # step 2
5424           s_out4 = adjust_result <s_out3>       # step 3
5425
5426           (step 3 is optional, and steps 1 and 2 may be combined).
5427           Lastly, the uses of s_out0 are replaced by s_out4.  */
5428
5429
5430   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5431          v_out1 = phi <VECT_DEF>
5432          Store them in NEW_PHIS.  */
5433   if (double_reduc)
5434     loop = outer_loop;
5435   exit_bb = single_exit (loop)->dest;
5436   exit_gsi = gsi_after_labels (exit_bb);
5437   reduc_inputs.create (slp_node ? vec_num : ncopies);
5438   for (unsigned i = 0; i < vec_num; i++)
5439     {
5440       gimple_seq stmts = NULL;
5441       if (slp_node)
5442         def = vect_get_slp_vect_def (slp_node, i);
5443       else
5444         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5445       for (j = 0; j < ncopies; j++)
5446         {
5447           tree new_def = copy_ssa_name (def);
5448           phi = create_phi_node (new_def, exit_bb);
5449           if (j)
5450             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5451           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5452           new_def = gimple_convert (&stmts, vectype, new_def);
5453           reduc_inputs.quick_push (new_def);
5454         }
5455       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5456     }
5457
5458   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5459          (i.e. when reduc_fn is not available) and in the final adjustment
5460          code (if needed).  Also get the original scalar reduction variable as
5461          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5462          represents a reduction pattern), the tree-code and scalar-def are
5463          taken from the original stmt that the pattern-stmt (STMT) replaces.
5464          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5465          are taken from STMT.  */
5466
5467   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5468   if (orig_stmt_info != stmt_info)
5469     {
5470       /* Reduction pattern  */
5471       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5472       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5473     }
5474
5475   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5476   scalar_type = TREE_TYPE (scalar_dest);
5477   scalar_results.truncate (0);
5478   scalar_results.reserve_exact (group_size);
5479   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5480   bitsize = TYPE_SIZE (scalar_type);
5481
5482   /* True if we should implement SLP_REDUC using native reduction operations
5483      instead of scalar operations.  */
5484   direct_slp_reduc = (reduc_fn != IFN_LAST
5485                       && slp_reduc
5486                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5487
5488   /* In case of reduction chain, e.g.,
5489      # a1 = phi <a3, a0>
5490      a2 = operation (a1)
5491      a3 = operation (a2),
5492
5493      we may end up with more than one vector result.  Here we reduce them
5494      to one vector.
5495
5496      The same is true if we couldn't use a single defuse cycle.  */
5497   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5498       || direct_slp_reduc
5499       || ncopies > 1)
5500     {
5501       gimple_seq stmts = NULL;
5502       tree single_input = reduc_inputs[0];
5503       for (k = 1; k < reduc_inputs.length (); k++)
5504         single_input = gimple_build (&stmts, code, vectype,
5505                                      single_input, reduc_inputs[k]);
5506       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5507
5508       reduc_inputs.truncate (0);
5509       reduc_inputs.safe_push (single_input);
5510     }
5511
5512   tree orig_reduc_input = reduc_inputs[0];
5513
5514   /* If this loop is an epilogue loop that can be skipped after the
5515      main loop, we can only share a reduction operation between the
5516      main loop and the epilogue if we put it at the target of the
5517      skip edge.
5518
5519      We can still reuse accumulators if this check fails.  Doing so has
5520      the minor(?) benefit of making the epilogue loop's scalar result
5521      independent of the main loop's scalar result.  */
5522   bool unify_with_main_loop_p = false;
5523   if (reduc_info->reused_accumulator
5524       && loop_vinfo->skip_this_loop_edge
5525       && single_succ_p (exit_bb)
5526       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5527     {
5528       unify_with_main_loop_p = true;
5529
5530       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5531       reduc_inputs[0] = make_ssa_name (vectype);
5532       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5533       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5534                    UNKNOWN_LOCATION);
5535       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5536                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5537       exit_gsi = gsi_after_labels (reduc_block);
5538     }
5539
5540   /* Shouldn't be used beyond this point.  */
5541   exit_bb = nullptr;
5542
5543   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5544       && reduc_fn != IFN_LAST)
5545     {
5546       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5547          various data values where the condition matched and another vector
5548          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5549          need to extract the last matching index (which will be the index with
5550          highest value) and use this to index into the data vector.
5551          For the case where there were no matches, the data vector will contain
5552          all default values and the index vector will be all zeros.  */
5553
5554       /* Get various versions of the type of the vector of indexes.  */
5555       tree index_vec_type = TREE_TYPE (induction_index);
5556       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5557       tree index_scalar_type = TREE_TYPE (index_vec_type);
5558       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5559
5560       /* Get an unsigned integer version of the type of the data vector.  */
5561       int scalar_precision
5562         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5563       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5564       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5565                                                 vectype);
5566
5567       /* First we need to create a vector (ZERO_VEC) of zeros and another
5568          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5569          can create using a MAX reduction and then expanding.
5570          In the case where the loop never made any matches, the max index will
5571          be zero.  */
5572
5573       /* Vector of {0, 0, 0,...}.  */
5574       tree zero_vec = build_zero_cst (vectype);
5575
5576       /* Find maximum value from the vector of found indexes.  */
5577       tree max_index = make_ssa_name (index_scalar_type);
5578       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5579                                                           1, induction_index);
5580       gimple_call_set_lhs (max_index_stmt, max_index);
5581       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5582
5583       /* Vector of {max_index, max_index, max_index,...}.  */
5584       tree max_index_vec = make_ssa_name (index_vec_type);
5585       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5586                                                       max_index);
5587       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5588                                                         max_index_vec_rhs);
5589       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5590
5591       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5592          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5593          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5594          otherwise.  Only one value should match, resulting in a vector
5595          (VEC_COND) with one data value and the rest zeros.
5596          In the case where the loop never made any matches, every index will
5597          match, resulting in a vector with all data values (which will all be
5598          the default value).  */
5599
5600       /* Compare the max index vector to the vector of found indexes to find
5601          the position of the max value.  */
5602       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5603       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5604                                                       induction_index,
5605                                                       max_index_vec);
5606       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5607
5608       /* Use the compare to choose either values from the data vector or
5609          zero.  */
5610       tree vec_cond = make_ssa_name (vectype);
5611       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5612                                                    vec_compare,
5613                                                    reduc_inputs[0],
5614                                                    zero_vec);
5615       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5616
5617       /* Finally we need to extract the data value from the vector (VEC_COND)
5618          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5619          reduction, but because this doesn't exist, we can use a MAX reduction
5620          instead.  The data value might be signed or a float so we need to cast
5621          it first.
5622          In the case where the loop never made any matches, the data values are
5623          all identical, and so will reduce down correctly.  */
5624
5625       /* Make the matched data values unsigned.  */
5626       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5627       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5628                                        vec_cond);
5629       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5630                                                         VIEW_CONVERT_EXPR,
5631                                                         vec_cond_cast_rhs);
5632       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5633
5634       /* Reduce down to a scalar value.  */
5635       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5636       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5637                                                            1, vec_cond_cast);
5638       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5639       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5640
5641       /* Convert the reduced value back to the result type and set as the
5642          result.  */
5643       gimple_seq stmts = NULL;
5644       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5645                                data_reduc);
5646       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5647       scalar_results.safe_push (new_temp);
5648     }
5649   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5650            && reduc_fn == IFN_LAST)
5651     {
5652       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5653          idx = 0;
5654          idx_val = induction_index[0];
5655          val = data_reduc[0];
5656          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5657            if (induction_index[i] > idx_val)
5658              val = data_reduc[i], idx_val = induction_index[i];
5659          return val;  */
5660
5661       tree data_eltype = TREE_TYPE (vectype);
5662       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5663       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5664       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5665       /* Enforced by vectorizable_reduction, which ensures we have target
5666          support before allowing a conditional reduction on variable-length
5667          vectors.  */
5668       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5669       tree idx_val = NULL_TREE, val = NULL_TREE;
5670       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5671         {
5672           tree old_idx_val = idx_val;
5673           tree old_val = val;
5674           idx_val = make_ssa_name (idx_eltype);
5675           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5676                                              build3 (BIT_FIELD_REF, idx_eltype,
5677                                                      induction_index,
5678                                                      bitsize_int (el_size),
5679                                                      bitsize_int (off)));
5680           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5681           val = make_ssa_name (data_eltype);
5682           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5683                                              build3 (BIT_FIELD_REF,
5684                                                      data_eltype,
5685                                                      reduc_inputs[0],
5686                                                      bitsize_int (el_size),
5687                                                      bitsize_int (off)));
5688           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5689           if (off != 0)
5690             {
5691               tree new_idx_val = idx_val;
5692               if (off != v_size - el_size)
5693                 {
5694                   new_idx_val = make_ssa_name (idx_eltype);
5695                   epilog_stmt = gimple_build_assign (new_idx_val,
5696                                                      MAX_EXPR, idx_val,
5697                                                      old_idx_val);
5698                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5699                 }
5700               tree new_val = make_ssa_name (data_eltype);
5701               epilog_stmt = gimple_build_assign (new_val,
5702                                                  COND_EXPR,
5703                                                  build2 (GT_EXPR,
5704                                                          boolean_type_node,
5705                                                          idx_val,
5706                                                          old_idx_val),
5707                                                  val, old_val);
5708               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5709               idx_val = new_idx_val;
5710               val = new_val;
5711             }
5712         }
5713       /* Convert the reduced value back to the result type and set as the
5714          result.  */
5715       gimple_seq stmts = NULL;
5716       val = gimple_convert (&stmts, scalar_type, val);
5717       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5718       scalar_results.safe_push (val);
5719     }
5720
5721   /* 2.3 Create the reduction code, using one of the three schemes described
5722          above. In SLP we simply need to extract all the elements from the
5723          vector (without reducing them), so we use scalar shifts.  */
5724   else if (reduc_fn != IFN_LAST && !slp_reduc)
5725     {
5726       tree tmp;
5727       tree vec_elem_type;
5728
5729       /* Case 1:  Create:
5730          v_out2 = reduc_expr <v_out1>  */
5731
5732       if (dump_enabled_p ())
5733         dump_printf_loc (MSG_NOTE, vect_location,
5734                          "Reduce using direct vector reduction.\n");
5735
5736       gimple_seq stmts = NULL;
5737       vec_elem_type = TREE_TYPE (vectype);
5738       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5739                                vec_elem_type, reduc_inputs[0]);
5740       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5741       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5742
5743       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5744           && induc_val)
5745         {
5746           /* Earlier we set the initial value to be a vector if induc_val
5747              values.  Check the result and if it is induc_val then replace
5748              with the original initial value, unless induc_val is
5749              the same as initial_def already.  */
5750           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5751                                   induc_val);
5752           tree initial_def = reduc_info->reduc_initial_values[0];
5753
5754           tmp = make_ssa_name (new_scalar_dest);
5755           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5756                                              initial_def, new_temp);
5757           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5758           new_temp = tmp;
5759         }
5760
5761       scalar_results.safe_push (new_temp);
5762     }
5763   else if (direct_slp_reduc)
5764     {
5765       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5766          with the elements for other SLP statements replaced with the
5767          neutral value.  We can then do a normal reduction on each vector.  */
5768
5769       /* Enforced by vectorizable_reduction.  */
5770       gcc_assert (reduc_inputs.length () == 1);
5771       gcc_assert (pow2p_hwi (group_size));
5772
5773       gimple_seq seq = NULL;
5774
5775       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5776          and the same element size as VECTYPE.  */
5777       tree index = build_index_vector (vectype, 0, 1);
5778       tree index_type = TREE_TYPE (index);
5779       tree index_elt_type = TREE_TYPE (index_type);
5780       tree mask_type = truth_type_for (index_type);
5781
5782       /* Create a vector that, for each element, identifies which of
5783          the REDUC_GROUP_SIZE results should use it.  */
5784       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5785       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5786                             build_vector_from_val (index_type, index_mask));
5787
5788       /* Get a neutral vector value.  This is simply a splat of the neutral
5789          scalar value if we have one, otherwise the initial scalar value
5790          is itself a neutral value.  */
5791       tree vector_identity = NULL_TREE;
5792       tree neutral_op = NULL_TREE;
5793       if (slp_node)
5794         {
5795           tree initial_value = NULL_TREE;
5796           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5797             initial_value = reduc_info->reduc_initial_values[0];
5798           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5799                                                  initial_value);
5800         }
5801       if (neutral_op)
5802         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5803                                                         neutral_op);
5804       for (unsigned int i = 0; i < group_size; ++i)
5805         {
5806           /* If there's no univeral neutral value, we can use the
5807              initial scalar value from the original PHI.  This is used
5808              for MIN and MAX reduction, for example.  */
5809           if (!neutral_op)
5810             {
5811               tree scalar_value = reduc_info->reduc_initial_values[i];
5812               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5813                                              scalar_value);
5814               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5815                                                               scalar_value);
5816             }
5817
5818           /* Calculate the equivalent of:
5819
5820              sel[j] = (index[j] == i);
5821
5822              which selects the elements of REDUC_INPUTS[0] that should
5823              be included in the result.  */
5824           tree compare_val = build_int_cst (index_elt_type, i);
5825           compare_val = build_vector_from_val (index_type, compare_val);
5826           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5827                                    index, compare_val);
5828
5829           /* Calculate the equivalent of:
5830
5831              vec = seq ? reduc_inputs[0] : vector_identity;
5832
5833              VEC is now suitable for a full vector reduction.  */
5834           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5835                                    sel, reduc_inputs[0], vector_identity);
5836
5837           /* Do the reduction and convert it to the appropriate type.  */
5838           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5839                                       TREE_TYPE (vectype), vec);
5840           scalar = gimple_convert (&seq, scalar_type, scalar);
5841           scalar_results.safe_push (scalar);
5842         }
5843       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5844     }
5845   else
5846     {
5847       bool reduce_with_shift;
5848       tree vec_temp;
5849
5850       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5851
5852       /* See if the target wants to do the final (shift) reduction
5853          in a vector mode of smaller size and first reduce upper/lower
5854          halves against each other.  */
5855       enum machine_mode mode1 = mode;
5856       tree stype = TREE_TYPE (vectype);
5857       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5858       unsigned nunits1 = nunits;
5859       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5860           && reduc_inputs.length () == 1)
5861         {
5862           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5863           /* For SLP reductions we have to make sure lanes match up, but
5864              since we're doing individual element final reduction reducing
5865              vector width here is even more important.
5866              ???  We can also separate lanes with permutes, for the common
5867              case of power-of-two group-size odd/even extracts would work.  */
5868           if (slp_reduc && nunits != nunits1)
5869             {
5870               nunits1 = least_common_multiple (nunits1, group_size);
5871               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5872             }
5873         }
5874       if (!slp_reduc
5875           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5876         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5877
5878       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5879                                                            stype, nunits1);
5880       reduce_with_shift = have_whole_vector_shift (mode1);
5881       if (!VECTOR_MODE_P (mode1)
5882           || !directly_supported_p (code, vectype1))
5883         reduce_with_shift = false;
5884
5885       /* First reduce the vector to the desired vector size we should
5886          do shift reduction on by combining upper and lower halves.  */
5887       gimple_seq stmts = NULL;
5888       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5889                                              code, &stmts);
5890       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5891       reduc_inputs[0] = new_temp;
5892
5893       if (reduce_with_shift && !slp_reduc)
5894         {
5895           int element_bitsize = tree_to_uhwi (bitsize);
5896           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5897              for variable-length vectors and also requires direct target support
5898              for loop reductions.  */
5899           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5900           int nelements = vec_size_in_bits / element_bitsize;
5901           vec_perm_builder sel;
5902           vec_perm_indices indices;
5903
5904           int elt_offset;
5905
5906           tree zero_vec = build_zero_cst (vectype1);
5907           /* Case 2: Create:
5908              for (offset = nelements/2; offset >= 1; offset/=2)
5909                 {
5910                   Create:  va' = vec_shift <va, offset>
5911                   Create:  va = vop <va, va'>
5912                 }  */
5913
5914           tree rhs;
5915
5916           if (dump_enabled_p ())
5917             dump_printf_loc (MSG_NOTE, vect_location,
5918                              "Reduce using vector shifts\n");
5919
5920           gimple_seq stmts = NULL;
5921           new_temp = gimple_convert (&stmts, vectype1, new_temp);
5922           for (elt_offset = nelements / 2;
5923                elt_offset >= 1;
5924                elt_offset /= 2)
5925             {
5926               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5927               indices.new_vector (sel, 2, nelements);
5928               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5929               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5930                                        new_temp, zero_vec, mask);
5931               new_temp = gimple_build (&stmts, code,
5932                                        vectype1, new_name, new_temp);
5933             }
5934           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5935
5936           /* 2.4  Extract the final scalar result.  Create:
5937              s_out3 = extract_field <v_out2, bitpos>  */
5938
5939           if (dump_enabled_p ())
5940             dump_printf_loc (MSG_NOTE, vect_location,
5941                              "extract scalar result\n");
5942
5943           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5944                         bitsize, bitsize_zero_node);
5945           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5946           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5947           gimple_assign_set_lhs (epilog_stmt, new_temp);
5948           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5949           scalar_results.safe_push (new_temp);
5950         }
5951       else
5952         {
5953           /* Case 3: Create:
5954              s = extract_field <v_out2, 0>
5955              for (offset = element_size;
5956                   offset < vector_size;
5957                   offset += element_size;)
5958                {
5959                  Create:  s' = extract_field <v_out2, offset>
5960                  Create:  s = op <s, s'>  // For non SLP cases
5961                }  */
5962
5963           if (dump_enabled_p ())
5964             dump_printf_loc (MSG_NOTE, vect_location,
5965                              "Reduce using scalar code.\n");
5966
5967           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5968           int element_bitsize = tree_to_uhwi (bitsize);
5969           tree compute_type = TREE_TYPE (vectype);
5970           gimple_seq stmts = NULL;
5971           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5972             {
5973               int bit_offset;
5974               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5975                                        vec_temp, bitsize, bitsize_zero_node);
5976
5977               /* In SLP we don't need to apply reduction operation, so we just
5978                  collect s' values in SCALAR_RESULTS.  */
5979               if (slp_reduc)
5980                 scalar_results.safe_push (new_temp);
5981
5982               for (bit_offset = element_bitsize;
5983                    bit_offset < vec_size_in_bits;
5984                    bit_offset += element_bitsize)
5985                 {
5986                   tree bitpos = bitsize_int (bit_offset);
5987                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
5988                                            compute_type, vec_temp,
5989                                            bitsize, bitpos);
5990                   if (slp_reduc)
5991                     {
5992                       /* In SLP we don't need to apply reduction operation, so
5993                          we just collect s' values in SCALAR_RESULTS.  */
5994                       new_temp = new_name;
5995                       scalar_results.safe_push (new_name);
5996                     }
5997                   else
5998                     new_temp = gimple_build (&stmts, code, compute_type,
5999                                              new_name, new_temp);
6000                 }
6001             }
6002
6003           /* The only case where we need to reduce scalar results in SLP, is
6004              unrolling.  If the size of SCALAR_RESULTS is greater than
6005              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6006              REDUC_GROUP_SIZE.  */
6007           if (slp_reduc)
6008             {
6009               tree res, first_res, new_res;
6010
6011               /* Reduce multiple scalar results in case of SLP unrolling.  */
6012               for (j = group_size; scalar_results.iterate (j, &res);
6013                    j++)
6014                 {
6015                   first_res = scalar_results[j % group_size];
6016                   new_res = gimple_build (&stmts, code, compute_type,
6017                                           first_res, res);
6018                   scalar_results[j % group_size] = new_res;
6019                 }
6020               scalar_results.truncate (group_size);
6021               for (k = 0; k < group_size; k++)
6022                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6023                                                     scalar_results[k]);
6024             }
6025           else
6026             {
6027               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6028               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6029               scalar_results.safe_push (new_temp);
6030             }
6031
6032           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6033         }
6034
6035       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6036           && induc_val)
6037         {
6038           /* Earlier we set the initial value to be a vector if induc_val
6039              values.  Check the result and if it is induc_val then replace
6040              with the original initial value, unless induc_val is
6041              the same as initial_def already.  */
6042           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
6043                                   induc_val);
6044           tree initial_def = reduc_info->reduc_initial_values[0];
6045
6046           tree tmp = make_ssa_name (new_scalar_dest);
6047           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6048                                              initial_def, new_temp);
6049           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6050           scalar_results[0] = tmp;
6051         }
6052     }
6053
6054   /* 2.5 Adjust the final result by the initial value of the reduction
6055          variable. (When such adjustment is not needed, then
6056          'adjustment_def' is zero).  For example, if code is PLUS we create:
6057          new_temp = loop_exit_def + adjustment_def  */
6058
6059   if (adjustment_def)
6060     {
6061       gcc_assert (!slp_reduc);
6062       gimple_seq stmts = NULL;
6063       if (double_reduc)
6064         {
6065           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6066           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6067           new_temp = gimple_build (&stmts, code, vectype,
6068                                    reduc_inputs[0], adjustment_def);
6069         }
6070       else
6071         {
6072           new_temp = scalar_results[0];
6073           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6074           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6075           new_temp = gimple_build (&stmts, code, scalar_type,
6076                                    new_temp, adjustment_def);
6077         }
6078
6079       epilog_stmt = gimple_seq_last_stmt (stmts);
6080       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6081       scalar_results[0] = new_temp;
6082     }
6083
6084   /* Record this operation if it could be reused by the epilogue loop.  */
6085   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6086     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6087                                            { orig_reduc_input, reduc_info });
6088
6089   if (double_reduc)
6090     loop = outer_loop;
6091
6092   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6093           phis with new adjusted scalar results, i.e., replace use <s_out0>
6094           with use <s_out4>.
6095
6096      Transform:
6097         loop_exit:
6098           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6099           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6100           v_out2 = reduce <v_out1>
6101           s_out3 = extract_field <v_out2, 0>
6102           s_out4 = adjust_result <s_out3>
6103           use <s_out0>
6104           use <s_out0>
6105
6106      into:
6107
6108         loop_exit:
6109           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6110           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6111           v_out2 = reduce <v_out1>
6112           s_out3 = extract_field <v_out2, 0>
6113           s_out4 = adjust_result <s_out3>
6114           use <s_out4>
6115           use <s_out4> */
6116
6117   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6118   for (k = 0; k < live_out_stmts.size (); k++)
6119     {
6120       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6121       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6122
6123       phis.create (3);
6124       /* Find the loop-closed-use at the loop exit of the original scalar
6125          result.  (The reduction result is expected to have two immediate uses,
6126          one at the latch block, and one at the loop exit).  For double
6127          reductions we are looking for exit phis of the outer loop.  */
6128       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6129         {
6130           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6131             {
6132               if (!is_gimple_debug (USE_STMT (use_p)))
6133                 phis.safe_push (USE_STMT (use_p));
6134             }
6135           else
6136             {
6137               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6138                 {
6139                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6140
6141                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6142                     {
6143                       if (!flow_bb_inside_loop_p (loop,
6144                                              gimple_bb (USE_STMT (phi_use_p)))
6145                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6146                         phis.safe_push (USE_STMT (phi_use_p));
6147                     }
6148                 }
6149             }
6150         }
6151
6152       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6153         {
6154           /* Replace the uses:  */
6155           orig_name = PHI_RESULT (exit_phi);
6156
6157           /* Look for a single use at the target of the skip edge.  */
6158           if (unify_with_main_loop_p)
6159             {
6160               use_operand_p use_p;
6161               gimple *user;
6162               if (!single_imm_use (orig_name, &use_p, &user))
6163                 gcc_unreachable ();
6164               orig_name = gimple_get_lhs (user);
6165             }
6166
6167           scalar_result = scalar_results[k];
6168           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6169             {
6170               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6171                 SET_USE (use_p, scalar_result);
6172               update_stmt (use_stmt);
6173             }
6174         }
6175
6176       phis.release ();
6177     }
6178 }
6179
6180 /* Return a vector of type VECTYPE that is equal to the vector select
6181    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6182    before GSI.  */
6183
6184 static tree
6185 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6186                      tree vec, tree identity)
6187 {
6188   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6189   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6190                                           mask, vec, identity);
6191   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6192   return cond;
6193 }
6194
6195 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6196    order, starting with LHS.  Insert the extraction statements before GSI and
6197    associate the new scalar SSA names with variable SCALAR_DEST.
6198    Return the SSA name for the result.  */
6199
6200 static tree
6201 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6202                        tree_code code, tree lhs, tree vector_rhs)
6203 {
6204   tree vectype = TREE_TYPE (vector_rhs);
6205   tree scalar_type = TREE_TYPE (vectype);
6206   tree bitsize = TYPE_SIZE (scalar_type);
6207   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6208   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6209
6210   for (unsigned HOST_WIDE_INT bit_offset = 0;
6211        bit_offset < vec_size_in_bits;
6212        bit_offset += element_bitsize)
6213     {
6214       tree bitpos = bitsize_int (bit_offset);
6215       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6216                          bitsize, bitpos);
6217
6218       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6219       rhs = make_ssa_name (scalar_dest, stmt);
6220       gimple_assign_set_lhs (stmt, rhs);
6221       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6222
6223       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6224       tree new_name = make_ssa_name (scalar_dest, stmt);
6225       gimple_assign_set_lhs (stmt, new_name);
6226       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6227       lhs = new_name;
6228     }
6229   return lhs;
6230 }
6231
6232 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6233    type of the vector input.  */
6234
6235 static internal_fn
6236 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6237 {
6238   internal_fn mask_reduc_fn;
6239
6240   switch (reduc_fn)
6241     {
6242     case IFN_FOLD_LEFT_PLUS:
6243       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6244       break;
6245
6246     default:
6247       return IFN_LAST;
6248     }
6249
6250   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6251                                       OPTIMIZE_FOR_SPEED))
6252     return mask_reduc_fn;
6253   return IFN_LAST;
6254 }
6255
6256 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6257    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6258    statement.  CODE is the operation performed by STMT_INFO and OPS are
6259    its scalar operands.  REDUC_INDEX is the index of the operand in
6260    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6261    implements in-order reduction, or IFN_LAST if we should open-code it.
6262    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6263    that should be used to control the operation in a fully-masked loop.  */
6264
6265 static bool
6266 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6267                                stmt_vec_info stmt_info,
6268                                gimple_stmt_iterator *gsi,
6269                                gimple **vec_stmt, slp_tree slp_node,
6270                                gimple *reduc_def_stmt,
6271                                tree_code code, internal_fn reduc_fn,
6272                                tree ops[3], tree vectype_in,
6273                                int reduc_index, vec_loop_masks *masks)
6274 {
6275   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6276   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6277   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6278
6279   int ncopies;
6280   if (slp_node)
6281     ncopies = 1;
6282   else
6283     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6284
6285   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6286   gcc_assert (ncopies == 1);
6287   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6288
6289   if (slp_node)
6290     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6291                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6292
6293   tree op0 = ops[1 - reduc_index];
6294
6295   int group_size = 1;
6296   stmt_vec_info scalar_dest_def_info;
6297   auto_vec<tree> vec_oprnds0;
6298   if (slp_node)
6299     {
6300       auto_vec<vec<tree> > vec_defs (2);
6301       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6302       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6303       vec_defs[0].release ();
6304       vec_defs[1].release ();
6305       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6306       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6307     }
6308   else
6309     {
6310       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6311                                      op0, &vec_oprnds0);
6312       scalar_dest_def_info = stmt_info;
6313     }
6314
6315   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6316   tree scalar_type = TREE_TYPE (scalar_dest);
6317   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6318
6319   int vec_num = vec_oprnds0.length ();
6320   gcc_assert (vec_num == 1 || slp_node);
6321   tree vec_elem_type = TREE_TYPE (vectype_out);
6322   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6323
6324   tree vector_identity = NULL_TREE;
6325   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6326     vector_identity = build_zero_cst (vectype_out);
6327
6328   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6329   int i;
6330   tree def0;
6331   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6332     {
6333       gimple *new_stmt;
6334       tree mask = NULL_TREE;
6335       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6336         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6337
6338       /* Handle MINUS by adding the negative.  */
6339       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6340         {
6341           tree negated = make_ssa_name (vectype_out);
6342           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6343           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6344           def0 = negated;
6345         }
6346
6347       if (mask && mask_reduc_fn == IFN_LAST)
6348         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6349                                     vector_identity);
6350
6351       /* On the first iteration the input is simply the scalar phi
6352          result, and for subsequent iterations it is the output of
6353          the preceding operation.  */
6354       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6355         {
6356           if (mask && mask_reduc_fn != IFN_LAST)
6357             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6358                                                    def0, mask);
6359           else
6360             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6361                                                    def0);
6362           /* For chained SLP reductions the output of the previous reduction
6363              operation serves as the input of the next. For the final statement
6364              the output cannot be a temporary - we reuse the original
6365              scalar destination of the last statement.  */
6366           if (i != vec_num - 1)
6367             {
6368               gimple_set_lhs (new_stmt, scalar_dest_var);
6369               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6370               gimple_set_lhs (new_stmt, reduc_var);
6371             }
6372         }
6373       else
6374         {
6375           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6376                                              reduc_var, def0);
6377           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6378           /* Remove the statement, so that we can use the same code paths
6379              as for statements that we've just created.  */
6380           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6381           gsi_remove (&tmp_gsi, true);
6382         }
6383
6384       if (i == vec_num - 1)
6385         {
6386           gimple_set_lhs (new_stmt, scalar_dest);
6387           vect_finish_replace_stmt (loop_vinfo,
6388                                     scalar_dest_def_info,
6389                                     new_stmt);
6390         }
6391       else
6392         vect_finish_stmt_generation (loop_vinfo,
6393                                      scalar_dest_def_info,
6394                                      new_stmt, gsi);
6395
6396       if (slp_node)
6397         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6398       else
6399         {
6400           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6401           *vec_stmt = new_stmt;
6402         }
6403     }
6404
6405   return true;
6406 }
6407
6408 /* Function is_nonwrapping_integer_induction.
6409
6410    Check if STMT_VINO (which is part of loop LOOP) both increments and
6411    does not cause overflow.  */
6412
6413 static bool
6414 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6415 {
6416   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6417   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6418   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6419   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6420   widest_int ni, max_loop_value, lhs_max;
6421   wi::overflow_type overflow = wi::OVF_NONE;
6422
6423   /* Make sure the loop is integer based.  */
6424   if (TREE_CODE (base) != INTEGER_CST
6425       || TREE_CODE (step) != INTEGER_CST)
6426     return false;
6427
6428   /* Check that the max size of the loop will not wrap.  */
6429
6430   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6431     return true;
6432
6433   if (! max_stmt_executions (loop, &ni))
6434     return false;
6435
6436   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6437                             &overflow);
6438   if (overflow)
6439     return false;
6440
6441   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6442                             TYPE_SIGN (lhs_type), &overflow);
6443   if (overflow)
6444     return false;
6445
6446   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6447           <= TYPE_PRECISION (lhs_type));
6448 }
6449
6450 /* Check if masking can be supported by inserting a conditional expression.
6451    CODE is the code for the operation.  COND_FN is the conditional internal
6452    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6453 static bool
6454 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6455                          tree vectype_in)
6456 {
6457   if (cond_fn != IFN_LAST
6458       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6459                                          OPTIMIZE_FOR_SPEED))
6460     return false;
6461
6462   if (code.is_tree_code ())
6463     switch (tree_code (code))
6464       {
6465       case DOT_PROD_EXPR:
6466       case SAD_EXPR:
6467         return true;
6468
6469       default:
6470         break;
6471       }
6472   return false;
6473 }
6474
6475 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6476    code for the operation.  VOP is the array of operands.  MASK is the loop
6477    mask.  GSI is a statement iterator used to place the new conditional
6478    expression.  */
6479 static void
6480 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6481                       gimple_stmt_iterator *gsi)
6482 {
6483   switch (tree_code (code))
6484     {
6485     case DOT_PROD_EXPR:
6486       {
6487         tree vectype = TREE_TYPE (vop[1]);
6488         tree zero = build_zero_cst (vectype);
6489         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6490         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6491                                                mask, vop[1], zero);
6492         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6493         vop[1] = masked_op1;
6494         break;
6495       }
6496
6497     case SAD_EXPR:
6498       {
6499         tree vectype = TREE_TYPE (vop[1]);
6500         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6501         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6502                                                mask, vop[1], vop[0]);
6503         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6504         vop[1] = masked_op1;
6505         break;
6506       }
6507
6508     default:
6509       gcc_unreachable ();
6510     }
6511 }
6512
6513 /* Function vectorizable_reduction.
6514
6515    Check if STMT_INFO performs a reduction operation that can be vectorized.
6516    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6517    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6518    Return true if STMT_INFO is vectorizable in this way.
6519
6520    This function also handles reduction idioms (patterns) that have been
6521    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6522    may be of this form:
6523      X = pattern_expr (arg0, arg1, ..., X)
6524    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6525    sequence that had been detected and replaced by the pattern-stmt
6526    (STMT_INFO).
6527
6528    This function also handles reduction of condition expressions, for example:
6529      for (int i = 0; i < N; i++)
6530        if (a[i] < value)
6531          last = a[i];
6532    This is handled by vectorising the loop and creating an additional vector
6533    containing the loop indexes for which "a[i] < value" was true.  In the
6534    function epilogue this is reduced to a single max value and then used to
6535    index into the vector of results.
6536
6537    In some cases of reduction patterns, the type of the reduction variable X is
6538    different than the type of the other arguments of STMT_INFO.
6539    In such cases, the vectype that is used when transforming STMT_INFO into
6540    a vector stmt is different than the vectype that is used to determine the
6541    vectorization factor, because it consists of a different number of elements
6542    than the actual number of elements that are being operated upon in parallel.
6543
6544    For example, consider an accumulation of shorts into an int accumulator.
6545    On some targets it's possible to vectorize this pattern operating on 8
6546    shorts at a time (hence, the vectype for purposes of determining the
6547    vectorization factor should be V8HI); on the other hand, the vectype that
6548    is used to create the vector form is actually V4SI (the type of the result).
6549
6550    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6551    indicates what is the actual level of parallelism (V8HI in the example), so
6552    that the right vectorization factor would be derived.  This vectype
6553    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6554    be used to create the vectorized stmt.  The right vectype for the vectorized
6555    stmt is obtained from the type of the result X:
6556       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6557
6558    This means that, contrary to "regular" reductions (or "regular" stmts in
6559    general), the following equation:
6560       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6561    does *NOT* necessarily hold for reduction patterns.  */
6562
6563 bool
6564 vectorizable_reduction (loop_vec_info loop_vinfo,
6565                         stmt_vec_info stmt_info, slp_tree slp_node,
6566                         slp_instance slp_node_instance,
6567                         stmt_vector_for_cost *cost_vec)
6568 {
6569   tree vectype_in = NULL_TREE;
6570   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6571   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6572   stmt_vec_info cond_stmt_vinfo = NULL;
6573   int i;
6574   int ncopies;
6575   bool single_defuse_cycle = false;
6576   bool nested_cycle = false;
6577   bool double_reduc = false;
6578   int vec_num;
6579   tree tem;
6580   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6581   tree cond_reduc_val = NULL_TREE;
6582
6583   /* Make sure it was already recognized as a reduction computation.  */
6584   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6585       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6586       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6587     return false;
6588
6589   /* The stmt we store reduction analysis meta on.  */
6590   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6591   reduc_info->is_reduc_info = true;
6592
6593   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6594     {
6595       if (is_a <gphi *> (stmt_info->stmt))
6596         {
6597           if (slp_node)
6598             {
6599               /* We eventually need to set a vector type on invariant
6600                  arguments.  */
6601               unsigned j;
6602               slp_tree child;
6603               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6604                 if (!vect_maybe_update_slp_op_vectype
6605                        (child, SLP_TREE_VECTYPE (slp_node)))
6606                   {
6607                     if (dump_enabled_p ())
6608                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609                                        "incompatible vector types for "
6610                                        "invariants\n");
6611                     return false;
6612                   }
6613             }
6614           /* Analysis for double-reduction is done on the outer
6615              loop PHI, nested cycles have no further restrictions.  */
6616           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6617         }
6618       else
6619         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6620       return true;
6621     }
6622
6623   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6624   stmt_vec_info phi_info = stmt_info;
6625   if (!is_a <gphi *> (stmt_info->stmt))
6626     {
6627       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6628       return true;
6629     }
6630   if (slp_node)
6631     {
6632       slp_node_instance->reduc_phis = slp_node;
6633       /* ???  We're leaving slp_node to point to the PHIs, we only
6634          need it to get at the number of vector stmts which wasn't
6635          yet initialized for the instance root.  */
6636     }
6637   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6638     {
6639       use_operand_p use_p;
6640       gimple *use_stmt;
6641       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6642                                  &use_p, &use_stmt);
6643       gcc_assert (res);
6644       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6645     }
6646
6647   /* PHIs should not participate in patterns.  */
6648   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6649   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6650
6651   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6652      and compute the reduction chain length.  Discover the real
6653      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6654   tree reduc_def
6655     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6656                              loop_latch_edge
6657                                (gimple_bb (reduc_def_phi)->loop_father));
6658   unsigned reduc_chain_length = 0;
6659   bool only_slp_reduc_chain = true;
6660   stmt_info = NULL;
6661   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6662   while (reduc_def != PHI_RESULT (reduc_def_phi))
6663     {
6664       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6665       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6666       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6667         {
6668           if (dump_enabled_p ())
6669             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670                              "reduction chain broken by patterns.\n");
6671           return false;
6672         }
6673       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6674         only_slp_reduc_chain = false;
6675       /* ???  For epilogue generation live members of the chain need
6676          to point back to the PHI via their original stmt for
6677          info_for_reduction to work.  */
6678       if (STMT_VINFO_LIVE_P (vdef))
6679         STMT_VINFO_REDUC_DEF (def) = phi_info;
6680       gimple_match_op op;
6681       if (!gimple_extract_op (vdef->stmt, &op))
6682         {
6683           if (dump_enabled_p ())
6684             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6685                              "reduction chain includes unsupported"
6686                              " statement type.\n");
6687           return false;
6688         }
6689       if (CONVERT_EXPR_CODE_P (op.code))
6690         {
6691           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6692             {
6693               if (dump_enabled_p ())
6694                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6695                                  "conversion in the reduction chain.\n");
6696               return false;
6697             }
6698         }
6699       else if (!stmt_info)
6700         /* First non-conversion stmt.  */
6701         stmt_info = vdef;
6702       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6703       reduc_chain_length++;
6704       if (!stmt_info && slp_node)
6705         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6706     }
6707   /* PHIs should not participate in patterns.  */
6708   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6709
6710   if (nested_in_vect_loop_p (loop, stmt_info))
6711     {
6712       loop = loop->inner;
6713       nested_cycle = true;
6714     }
6715
6716   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6717      element.  */
6718   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6719     {
6720       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6721       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6722     }
6723   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6724     gcc_assert (slp_node
6725                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6726
6727   /* 1. Is vectorizable reduction?  */
6728   /* Not supportable if the reduction variable is used in the loop, unless
6729      it's a reduction chain.  */
6730   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6731       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6732     return false;
6733
6734   /* Reductions that are not used even in an enclosing outer-loop,
6735      are expected to be "live" (used out of the loop).  */
6736   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6737       && !STMT_VINFO_LIVE_P (stmt_info))
6738     return false;
6739
6740   /* 2. Has this been recognized as a reduction pattern?
6741
6742      Check if STMT represents a pattern that has been recognized
6743      in earlier analysis stages.  For stmts that represent a pattern,
6744      the STMT_VINFO_RELATED_STMT field records the last stmt in
6745      the original sequence that constitutes the pattern.  */
6746
6747   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6748   if (orig_stmt_info)
6749     {
6750       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6751       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6752     }
6753
6754   /* 3. Check the operands of the operation.  The first operands are defined
6755         inside the loop body. The last operand is the reduction variable,
6756         which is defined by the loop-header-phi.  */
6757
6758   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6759   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6760   gimple_match_op op;
6761   if (!gimple_extract_op (stmt_info->stmt, &op))
6762     gcc_unreachable ();
6763   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6764                             || op.code == WIDEN_SUM_EXPR
6765                             || op.code == SAD_EXPR);
6766   enum optab_subtype optab_query_kind = optab_vector;
6767   if (op.code == DOT_PROD_EXPR
6768       && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6769           != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6770     optab_query_kind = optab_vector_mixed_sign;
6771
6772   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6773       && !SCALAR_FLOAT_TYPE_P (op.type))
6774     return false;
6775
6776   /* Do not try to vectorize bit-precision reductions.  */
6777   if (!type_has_mode_precision_p (op.type))
6778     return false;
6779
6780   /* For lane-reducing ops we're reducing the number of reduction PHIs
6781      which means the only use of that may be in the lane-reducing operation.  */
6782   if (lane_reduc_code_p
6783       && reduc_chain_length != 1
6784       && !only_slp_reduc_chain)
6785     {
6786       if (dump_enabled_p ())
6787         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6788                          "lane-reducing reduction with extra stmts.\n");
6789       return false;
6790     }
6791
6792   /* All uses but the last are expected to be defined in the loop.
6793      The last use is the reduction variable.  In case of nested cycle this
6794      assumption is not true: we use reduc_index to record the index of the
6795      reduction variable.  */
6796   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6797   /* We need to skip an extra operand for COND_EXPRs with embedded
6798      comparison.  */
6799   unsigned opno_adjust = 0;
6800   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6801     opno_adjust = 1;
6802   for (i = 0; i < (int) op.num_ops; i++)
6803     {
6804       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6805       if (i == 0 && op.code == COND_EXPR)
6806         continue;
6807
6808       stmt_vec_info def_stmt_info;
6809       enum vect_def_type dt;
6810       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6811                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6812                                &tem, &def_stmt_info))
6813         {
6814           if (dump_enabled_p ())
6815             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6816                              "use not simple.\n");
6817           return false;
6818         }
6819       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6820         continue;
6821
6822       /* There should be only one cycle def in the stmt, the one
6823          leading to reduc_def.  */
6824       if (VECTORIZABLE_CYCLE_DEF (dt))
6825         return false;
6826
6827       /* To properly compute ncopies we are interested in the widest
6828          non-reduction input type in case we're looking at a widening
6829          accumulation that we later handle in vect_transform_reduction.  */
6830       if (lane_reduc_code_p
6831           && tem
6832           && (!vectype_in
6833               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6834                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6835         vectype_in = tem;
6836
6837       if (op.code == COND_EXPR)
6838         {
6839           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6840           if (dt == vect_constant_def)
6841             {
6842               cond_reduc_dt = dt;
6843               cond_reduc_val = op.ops[i];
6844             }
6845           if (dt == vect_induction_def
6846               && def_stmt_info
6847               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6848             {
6849               cond_reduc_dt = dt;
6850               cond_stmt_vinfo = def_stmt_info;
6851             }
6852         }
6853     }
6854   if (!vectype_in)
6855     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6856   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6857
6858   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6859   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6860   /* If we have a condition reduction, see if we can simplify it further.  */
6861   if (v_reduc_type == COND_REDUCTION)
6862     {
6863       if (slp_node)
6864         return false;
6865
6866       /* When the condition uses the reduction value in the condition, fail.  */
6867       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6868         {
6869           if (dump_enabled_p ())
6870             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6871                              "condition depends on previous iteration\n");
6872           return false;
6873         }
6874
6875       if (reduc_chain_length == 1
6876           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6877                                              vectype_in, OPTIMIZE_FOR_SPEED))
6878         {
6879           if (dump_enabled_p ())
6880             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881                              "optimizing condition reduction with"
6882                              " FOLD_EXTRACT_LAST.\n");
6883           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6884         }
6885       else if (cond_reduc_dt == vect_induction_def)
6886         {
6887           tree base
6888             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6889           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6890
6891           gcc_assert (TREE_CODE (base) == INTEGER_CST
6892                       && TREE_CODE (step) == INTEGER_CST);
6893           cond_reduc_val = NULL_TREE;
6894           enum tree_code cond_reduc_op_code = ERROR_MARK;
6895           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6896           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6897             ;
6898           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6899              above base; punt if base is the minimum value of the type for
6900              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6901           else if (tree_int_cst_sgn (step) == -1)
6902             {
6903               cond_reduc_op_code = MIN_EXPR;
6904               if (tree_int_cst_sgn (base) == -1)
6905                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6906               else if (tree_int_cst_lt (base,
6907                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6908                 cond_reduc_val
6909                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6910             }
6911           else
6912             {
6913               cond_reduc_op_code = MAX_EXPR;
6914               if (tree_int_cst_sgn (base) == 1)
6915                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6916               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6917                                         base))
6918                 cond_reduc_val
6919                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6920             }
6921           if (cond_reduc_val)
6922             {
6923               if (dump_enabled_p ())
6924                 dump_printf_loc (MSG_NOTE, vect_location,
6925                                  "condition expression based on "
6926                                  "integer induction.\n");
6927               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6928               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6929                 = cond_reduc_val;
6930               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6931             }
6932         }
6933       else if (cond_reduc_dt == vect_constant_def)
6934         {
6935           enum vect_def_type cond_initial_dt;
6936           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6937           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6938           if (cond_initial_dt == vect_constant_def
6939               && types_compatible_p (TREE_TYPE (cond_initial_val),
6940                                      TREE_TYPE (cond_reduc_val)))
6941             {
6942               tree e = fold_binary (LE_EXPR, boolean_type_node,
6943                                     cond_initial_val, cond_reduc_val);
6944               if (e && (integer_onep (e) || integer_zerop (e)))
6945                 {
6946                   if (dump_enabled_p ())
6947                     dump_printf_loc (MSG_NOTE, vect_location,
6948                                      "condition expression based on "
6949                                      "compile time constant.\n");
6950                   /* Record reduction code at analysis stage.  */
6951                   STMT_VINFO_REDUC_CODE (reduc_info)
6952                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6953                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6954                 }
6955             }
6956         }
6957     }
6958
6959   if (STMT_VINFO_LIVE_P (phi_info))
6960     return false;
6961
6962   if (slp_node)
6963     ncopies = 1;
6964   else
6965     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6966
6967   gcc_assert (ncopies >= 1);
6968
6969   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6970
6971   if (nested_cycle)
6972     {
6973       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6974                   == vect_double_reduction_def);
6975       double_reduc = true;
6976     }
6977
6978   /* 4.2. Check support for the epilog operation.
6979
6980           If STMT represents a reduction pattern, then the type of the
6981           reduction variable may be different than the type of the rest
6982           of the arguments.  For example, consider the case of accumulation
6983           of shorts into an int accumulator; The original code:
6984                         S1: int_a = (int) short_a;
6985           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6986
6987           was replaced with:
6988                         STMT: int_acc = widen_sum <short_a, int_acc>
6989
6990           This means that:
6991           1. The tree-code that is used to create the vector operation in the
6992              epilog code (that reduces the partial results) is not the
6993              tree-code of STMT, but is rather the tree-code of the original
6994              stmt from the pattern that STMT is replacing.  I.e, in the example
6995              above we want to use 'widen_sum' in the loop, but 'plus' in the
6996              epilog.
6997           2. The type (mode) we use to check available target support
6998              for the vector operation to be created in the *epilog*, is
6999              determined by the type of the reduction variable (in the example
7000              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7001              However the type (mode) we use to check available target support
7002              for the vector operation to be created *inside the loop*, is
7003              determined by the type of the other arguments to STMT (in the
7004              example we'd check this: optab_handler (widen_sum_optab,
7005              vect_short_mode)).
7006
7007           This is contrary to "regular" reductions, in which the types of all
7008           the arguments are the same as the type of the reduction variable.
7009           For "regular" reductions we can therefore use the same vector type
7010           (and also the same tree-code) when generating the epilog code and
7011           when generating the code inside the loop.  */
7012
7013   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7014   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7015
7016   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7017   if (reduction_type == TREE_CODE_REDUCTION)
7018     {
7019       /* Check whether it's ok to change the order of the computation.
7020          Generally, when vectorizing a reduction we change the order of the
7021          computation.  This may change the behavior of the program in some
7022          cases, so we need to check that this is ok.  One exception is when
7023          vectorizing an outer-loop: the inner-loop is executed sequentially,
7024          and therefore vectorizing reductions in the inner-loop during
7025          outer-loop vectorization is safe.  Likewise when we are vectorizing
7026          a series of reductions using SLP and the VF is one the reductions
7027          are performed in scalar order.  */
7028       if (slp_node
7029           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7030           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7031         ;
7032       else if (needs_fold_left_reduction_p (op.type, orig_code))
7033         {
7034           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7035              is not directy used in stmt.  */
7036           if (!only_slp_reduc_chain
7037               && reduc_chain_length != 1)
7038             {
7039               if (dump_enabled_p ())
7040                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041                                  "in-order reduction chain without SLP.\n");
7042               return false;
7043             }
7044           STMT_VINFO_REDUC_TYPE (reduc_info)
7045             = reduction_type = FOLD_LEFT_REDUCTION;
7046         }
7047       else if (!commutative_binary_op_p (orig_code, op.type)
7048                || !associative_binary_op_p (orig_code, op.type))
7049         {
7050           if (dump_enabled_p ())
7051             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052                             "reduction: not commutative/associative");
7053           return false;
7054         }
7055     }
7056
7057   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7058       && ncopies > 1)
7059     {
7060       if (dump_enabled_p ())
7061         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7062                          "multiple types in double reduction or condition "
7063                          "reduction or fold-left reduction.\n");
7064       return false;
7065     }
7066
7067   internal_fn reduc_fn = IFN_LAST;
7068   if (reduction_type == TREE_CODE_REDUCTION
7069       || reduction_type == FOLD_LEFT_REDUCTION
7070       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7071       || reduction_type == CONST_COND_REDUCTION)
7072     {
7073       if (reduction_type == FOLD_LEFT_REDUCTION
7074           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7075           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7076         {
7077           if (reduc_fn != IFN_LAST
7078               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7079                                                   OPTIMIZE_FOR_SPEED))
7080             {
7081               if (dump_enabled_p ())
7082                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7083                                  "reduc op not supported by target.\n");
7084
7085               reduc_fn = IFN_LAST;
7086             }
7087         }
7088       else
7089         {
7090           if (!nested_cycle || double_reduc)
7091             {
7092               if (dump_enabled_p ())
7093                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7094                                  "no reduc code for scalar code.\n");
7095
7096               return false;
7097             }
7098         }
7099     }
7100   else if (reduction_type == COND_REDUCTION)
7101     {
7102       int scalar_precision
7103         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7104       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7105       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7106                                                 vectype_out);
7107
7108       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7109                                           OPTIMIZE_FOR_SPEED))
7110         reduc_fn = IFN_REDUC_MAX;
7111     }
7112   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7113
7114   if (reduction_type != EXTRACT_LAST_REDUCTION
7115       && (!nested_cycle || double_reduc)
7116       && reduc_fn == IFN_LAST
7117       && !nunits_out.is_constant ())
7118     {
7119       if (dump_enabled_p ())
7120         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7121                          "missing target support for reduction on"
7122                          " variable-length vectors.\n");
7123       return false;
7124     }
7125
7126   /* For SLP reductions, see if there is a neutral value we can use.  */
7127   tree neutral_op = NULL_TREE;
7128   if (slp_node)
7129     {
7130       tree initial_value = NULL_TREE;
7131       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7132         initial_value = vect_phi_initial_value (reduc_def_phi);
7133       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7134                                              orig_code, initial_value);
7135     }
7136
7137   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7138     {
7139       /* We can't support in-order reductions of code such as this:
7140
7141            for (int i = 0; i < n1; ++i)
7142              for (int j = 0; j < n2; ++j)
7143                l += a[j];
7144
7145          since GCC effectively transforms the loop when vectorizing:
7146
7147            for (int i = 0; i < n1 / VF; ++i)
7148              for (int j = 0; j < n2; ++j)
7149                for (int k = 0; k < VF; ++k)
7150                  l += a[j];
7151
7152          which is a reassociation of the original operation.  */
7153       if (dump_enabled_p ())
7154         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7155                          "in-order double reduction not supported.\n");
7156
7157       return false;
7158     }
7159
7160   if (reduction_type == FOLD_LEFT_REDUCTION
7161       && slp_node
7162       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7163     {
7164       /* We cannot use in-order reductions in this case because there is
7165          an implicit reassociation of the operations involved.  */
7166       if (dump_enabled_p ())
7167         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7168                          "in-order unchained SLP reductions not supported.\n");
7169       return false;
7170     }
7171
7172   /* For double reductions, and for SLP reductions with a neutral value,
7173      we construct a variable-length initial vector by loading a vector
7174      full of the neutral value and then shift-and-inserting the start
7175      values into the low-numbered elements.  */
7176   if ((double_reduc || neutral_op)
7177       && !nunits_out.is_constant ()
7178       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7179                                           vectype_out, OPTIMIZE_FOR_SPEED))
7180     {
7181       if (dump_enabled_p ())
7182         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7183                          "reduction on variable-length vectors requires"
7184                          " target support for a vector-shift-and-insert"
7185                          " operation.\n");
7186       return false;
7187     }
7188
7189   /* Check extra constraints for variable-length unchained SLP reductions.  */
7190   if (STMT_SLP_TYPE (stmt_info)
7191       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7192       && !nunits_out.is_constant ())
7193     {
7194       /* We checked above that we could build the initial vector when
7195          there's a neutral element value.  Check here for the case in
7196          which each SLP statement has its own initial value and in which
7197          that value needs to be repeated for every instance of the
7198          statement within the initial vector.  */
7199       unsigned int group_size = SLP_TREE_LANES (slp_node);
7200       if (!neutral_op
7201           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7202                                               TREE_TYPE (vectype_out)))
7203         {
7204           if (dump_enabled_p ())
7205             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7206                              "unsupported form of SLP reduction for"
7207                              " variable-length vectors: cannot build"
7208                              " initial vector.\n");
7209           return false;
7210         }
7211       /* The epilogue code relies on the number of elements being a multiple
7212          of the group size.  The duplicate-and-interleave approach to setting
7213          up the initial vector does too.  */
7214       if (!multiple_p (nunits_out, group_size))
7215         {
7216           if (dump_enabled_p ())
7217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7218                              "unsupported form of SLP reduction for"
7219                              " variable-length vectors: the vector size"
7220                              " is not a multiple of the number of results.\n");
7221           return false;
7222         }
7223     }
7224
7225   if (reduction_type == COND_REDUCTION)
7226     {
7227       widest_int ni;
7228
7229       if (! max_loop_iterations (loop, &ni))
7230         {
7231           if (dump_enabled_p ())
7232             dump_printf_loc (MSG_NOTE, vect_location,
7233                              "loop count not known, cannot create cond "
7234                              "reduction.\n");
7235           return false;
7236         }
7237       /* Convert backedges to iterations.  */
7238       ni += 1;
7239
7240       /* The additional index will be the same type as the condition.  Check
7241          that the loop can fit into this less one (because we'll use up the
7242          zero slot for when there are no matches).  */
7243       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7244       if (wi::geu_p (ni, wi::to_widest (max_index)))
7245         {
7246           if (dump_enabled_p ())
7247             dump_printf_loc (MSG_NOTE, vect_location,
7248                              "loop size is greater than data size.\n");
7249           return false;
7250         }
7251     }
7252
7253   /* In case the vectorization factor (VF) is bigger than the number
7254      of elements that we can fit in a vectype (nunits), we have to generate
7255      more than one vector stmt - i.e - we need to "unroll" the
7256      vector stmt by a factor VF/nunits.  For more details see documentation
7257      in vectorizable_operation.  */
7258
7259   /* If the reduction is used in an outer loop we need to generate
7260      VF intermediate results, like so (e.g. for ncopies=2):
7261         r0 = phi (init, r0)
7262         r1 = phi (init, r1)
7263         r0 = x0 + r0;
7264         r1 = x1 + r1;
7265     (i.e. we generate VF results in 2 registers).
7266     In this case we have a separate def-use cycle for each copy, and therefore
7267     for each copy we get the vector def for the reduction variable from the
7268     respective phi node created for this copy.
7269
7270     Otherwise (the reduction is unused in the loop nest), we can combine
7271     together intermediate results, like so (e.g. for ncopies=2):
7272         r = phi (init, r)
7273         r = x0 + r;
7274         r = x1 + r;
7275    (i.e. we generate VF/2 results in a single register).
7276    In this case for each copy we get the vector def for the reduction variable
7277    from the vectorized reduction operation generated in the previous iteration.
7278
7279    This only works when we see both the reduction PHI and its only consumer
7280    in vectorizable_reduction and there are no intermediate stmts
7281    participating.  When unrolling we want each unrolled iteration to have its
7282    own reduction accumulator since one of the main goals of unrolling a
7283    reduction is to reduce the aggregate loop-carried latency.  */
7284   if (ncopies > 1
7285       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7286       && reduc_chain_length == 1
7287       && loop_vinfo->suggested_unroll_factor == 1)
7288     single_defuse_cycle = true;
7289
7290   if (single_defuse_cycle || lane_reduc_code_p)
7291     {
7292       gcc_assert (op.code != COND_EXPR);
7293
7294       /* 4. Supportable by target?  */
7295       bool ok = true;
7296
7297       /* 4.1. check support for the operation in the loop  */
7298       machine_mode vec_mode = TYPE_MODE (vectype_in);
7299       if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7300         {
7301           if (dump_enabled_p ())
7302             dump_printf (MSG_NOTE, "op not supported by target.\n");
7303           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7304               || !vect_can_vectorize_without_simd_p (op.code))
7305             ok = false;
7306           else
7307             if (dump_enabled_p ())
7308               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7309         }
7310
7311       if (vect_emulated_vector_p (vectype_in)
7312           && !vect_can_vectorize_without_simd_p (op.code))
7313         {
7314           if (dump_enabled_p ())
7315             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7316           return false;
7317         }
7318
7319       /* lane-reducing operations have to go through vect_transform_reduction.
7320          For the other cases try without the single cycle optimization.  */
7321       if (!ok)
7322         {
7323           if (lane_reduc_code_p)
7324             return false;
7325           else
7326             single_defuse_cycle = false;
7327         }
7328     }
7329   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7330
7331   /* If the reduction stmt is one of the patterns that have lane
7332      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7333   if ((ncopies > 1 && ! single_defuse_cycle)
7334       && lane_reduc_code_p)
7335     {
7336       if (dump_enabled_p ())
7337         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7338                          "multi def-use cycle not possible for lane-reducing "
7339                          "reduction operation\n");
7340       return false;
7341     }
7342
7343   if (slp_node
7344       && !(!single_defuse_cycle
7345            && !lane_reduc_code_p
7346            && reduction_type != FOLD_LEFT_REDUCTION))
7347     for (i = 0; i < (int) op.num_ops; i++)
7348       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7349         {
7350           if (dump_enabled_p ())
7351             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7352                              "incompatible vector types for invariants\n");
7353           return false;
7354         }
7355
7356   if (slp_node)
7357     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7358   else
7359     vec_num = 1;
7360
7361   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7362                              reduction_type, ncopies, cost_vec);
7363   /* Cost the reduction op inside the loop if transformed via
7364      vect_transform_reduction.  Otherwise this is costed by the
7365      separate vectorizable_* routines.  */
7366   if (single_defuse_cycle || lane_reduc_code_p)
7367     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7368
7369   if (dump_enabled_p ()
7370       && reduction_type == FOLD_LEFT_REDUCTION)
7371     dump_printf_loc (MSG_NOTE, vect_location,
7372                      "using an in-order (fold-left) reduction.\n");
7373   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7374   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7375      reductions go through their own vectorizable_* routines.  */
7376   if (!single_defuse_cycle
7377       && !lane_reduc_code_p
7378       && reduction_type != FOLD_LEFT_REDUCTION)
7379     {
7380       stmt_vec_info tem
7381         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7382       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7383         {
7384           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7385           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7386         }
7387       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7388       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7389     }
7390   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7391     {
7392       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7393       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7394
7395       if (reduction_type != FOLD_LEFT_REDUCTION
7396           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7397           && (cond_fn == IFN_LAST
7398               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7399                                                   OPTIMIZE_FOR_SPEED)))
7400         {
7401           if (dump_enabled_p ())
7402             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7403                              "can't operate on partial vectors because"
7404                              " no conditional operation is available.\n");
7405           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7406         }
7407       else if (reduction_type == FOLD_LEFT_REDUCTION
7408                && reduc_fn == IFN_LAST
7409                && !expand_vec_cond_expr_p (vectype_in,
7410                                            truth_type_for (vectype_in),
7411                                            SSA_NAME))
7412         {
7413           if (dump_enabled_p ())
7414             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7415                              "can't operate on partial vectors because"
7416                              " no conditional operation is available.\n");
7417           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7418         }
7419       else
7420         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7421                                vectype_in, NULL);
7422     }
7423   return true;
7424 }
7425
7426 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7427    value.  */
7428
7429 bool
7430 vect_transform_reduction (loop_vec_info loop_vinfo,
7431                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7432                           gimple **vec_stmt, slp_tree slp_node)
7433 {
7434   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7435   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7436   int i;
7437   int ncopies;
7438   int vec_num;
7439
7440   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7441   gcc_assert (reduc_info->is_reduc_info);
7442
7443   if (nested_in_vect_loop_p (loop, stmt_info))
7444     {
7445       loop = loop->inner;
7446       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7447     }
7448
7449   gimple_match_op op;
7450   if (!gimple_extract_op (stmt_info->stmt, &op))
7451     gcc_unreachable ();
7452   gcc_assert (op.code.is_tree_code ());
7453   auto code = tree_code (op.code);
7454
7455   /* All uses but the last are expected to be defined in the loop.
7456      The last use is the reduction variable.  In case of nested cycle this
7457      assumption is not true: we use reduc_index to record the index of the
7458      reduction variable.  */
7459   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7460   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7461   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7462   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7463
7464   if (slp_node)
7465     {
7466       ncopies = 1;
7467       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7468     }
7469   else
7470     {
7471       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7472       vec_num = 1;
7473     }
7474
7475   internal_fn cond_fn = get_conditional_internal_fn (code);
7476   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7477   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7478
7479   /* Transform.  */
7480   tree new_temp = NULL_TREE;
7481   auto_vec<tree> vec_oprnds0;
7482   auto_vec<tree> vec_oprnds1;
7483   auto_vec<tree> vec_oprnds2;
7484   tree def0;
7485
7486   if (dump_enabled_p ())
7487     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7488
7489   /* FORNOW: Multiple types are not supported for condition.  */
7490   if (code == COND_EXPR)
7491     gcc_assert (ncopies == 1);
7492
7493   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7494
7495   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7496   if (reduction_type == FOLD_LEFT_REDUCTION)
7497     {
7498       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7499       return vectorize_fold_left_reduction
7500           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7501            reduc_fn, op.ops, vectype_in, reduc_index, masks);
7502     }
7503
7504   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7505   gcc_assert (single_defuse_cycle
7506               || code == DOT_PROD_EXPR
7507               || code == WIDEN_SUM_EXPR
7508               || code == SAD_EXPR);
7509
7510   /* Create the destination vector  */
7511   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7512   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7513
7514   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7515                      single_defuse_cycle && reduc_index == 0
7516                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
7517                      single_defuse_cycle && reduc_index == 1
7518                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
7519                      op.num_ops == 3
7520                      && !(single_defuse_cycle && reduc_index == 2)
7521                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7522   if (single_defuse_cycle)
7523     {
7524       gcc_assert (!slp_node);
7525       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7526                                      op.ops[reduc_index],
7527                                      reduc_index == 0 ? &vec_oprnds0
7528                                      : (reduc_index == 1 ? &vec_oprnds1
7529                                         : &vec_oprnds2));
7530     }
7531
7532   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7533     {
7534       gimple *new_stmt;
7535       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7536       if (masked_loop_p && !mask_by_cond_expr)
7537         {
7538           /* Make sure that the reduction accumulator is vop[0].  */
7539           if (reduc_index == 1)
7540             {
7541               gcc_assert (commutative_tree_code (code));
7542               std::swap (vop[0], vop[1]);
7543             }
7544           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7545                                           vectype_in, i);
7546           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7547                                                     vop[0], vop[1], vop[0]);
7548           new_temp = make_ssa_name (vec_dest, call);
7549           gimple_call_set_lhs (call, new_temp);
7550           gimple_call_set_nothrow (call, true);
7551           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7552           new_stmt = call;
7553         }
7554       else
7555         {
7556           if (op.num_ops == 3)
7557             vop[2] = vec_oprnds2[i];
7558
7559           if (masked_loop_p && mask_by_cond_expr)
7560             {
7561               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7562                                               vectype_in, i);
7563               build_vect_cond_expr (code, vop, mask, gsi);
7564             }
7565
7566           new_stmt = gimple_build_assign (vec_dest, code,
7567                                           vop[0], vop[1], vop[2]);
7568           new_temp = make_ssa_name (vec_dest, new_stmt);
7569           gimple_assign_set_lhs (new_stmt, new_temp);
7570           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7571         }
7572
7573       if (slp_node)
7574         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7575       else if (single_defuse_cycle
7576                && i < ncopies - 1)
7577         {
7578           if (reduc_index == 0)
7579             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7580           else if (reduc_index == 1)
7581             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7582           else if (reduc_index == 2)
7583             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7584         }
7585       else
7586         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7587     }
7588
7589   if (!slp_node)
7590     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7591
7592   return true;
7593 }
7594
7595 /* Transform phase of a cycle PHI.  */
7596
7597 bool
7598 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7599                           stmt_vec_info stmt_info, gimple **vec_stmt,
7600                           slp_tree slp_node, slp_instance slp_node_instance)
7601 {
7602   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7603   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7604   int i;
7605   int ncopies;
7606   int j;
7607   bool nested_cycle = false;
7608   int vec_num;
7609
7610   if (nested_in_vect_loop_p (loop, stmt_info))
7611     {
7612       loop = loop->inner;
7613       nested_cycle = true;
7614     }
7615
7616   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7617   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7618   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7619   gcc_assert (reduc_info->is_reduc_info);
7620
7621   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7622       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7623     /* Leave the scalar phi in place.  */
7624     return true;
7625
7626   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7627   /* For a nested cycle we do not fill the above.  */
7628   if (!vectype_in)
7629     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7630   gcc_assert (vectype_in);
7631
7632   if (slp_node)
7633     {
7634       /* The size vect_schedule_slp_instance computes is off for us.  */
7635       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7636                                       * SLP_TREE_LANES (slp_node), vectype_in);
7637       ncopies = 1;
7638     }
7639   else
7640     {
7641       vec_num = 1;
7642       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7643     }
7644
7645   /* Check whether we should use a single PHI node and accumulate
7646      vectors to one before the backedge.  */
7647   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7648     ncopies = 1;
7649
7650   /* Create the destination vector  */
7651   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7652   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7653                                                vectype_out);
7654
7655   /* Get the loop-entry arguments.  */
7656   tree vec_initial_def = NULL_TREE;
7657   auto_vec<tree> vec_initial_defs;
7658   if (slp_node)
7659     {
7660       vec_initial_defs.reserve (vec_num);
7661       if (nested_cycle)
7662         {
7663           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7664           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7665                              &vec_initial_defs);
7666         }
7667       else
7668         {
7669           gcc_assert (slp_node == slp_node_instance->reduc_phis);
7670           vec<tree> &initial_values = reduc_info->reduc_initial_values;
7671           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7672
7673           unsigned int num_phis = stmts.length ();
7674           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7675             num_phis = 1;
7676           initial_values.reserve (num_phis);
7677           for (unsigned int i = 0; i < num_phis; ++i)
7678             {
7679               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7680               initial_values.quick_push (vect_phi_initial_value (this_phi));
7681             }
7682           if (vec_num == 1)
7683             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7684           if (!initial_values.is_empty ())
7685             {
7686               tree initial_value
7687                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7688               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7689               tree neutral_op
7690                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7691                                             code, initial_value);
7692               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7693                                               &vec_initial_defs, vec_num,
7694                                               stmts.length (), neutral_op);
7695             }
7696         }
7697     }
7698   else
7699     {
7700       /* Get at the scalar def before the loop, that defines the initial
7701          value of the reduction variable.  */
7702       tree initial_def = vect_phi_initial_value (phi);
7703       reduc_info->reduc_initial_values.safe_push (initial_def);
7704       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7705          and we can't use zero for induc_val, use initial_def.  Similarly
7706          for REDUC_MIN and initial_def larger than the base.  */
7707       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7708         {
7709           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7710           if (TREE_CODE (initial_def) == INTEGER_CST
7711               && !integer_zerop (induc_val)
7712               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7713                    && tree_int_cst_lt (initial_def, induc_val))
7714                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7715                       && tree_int_cst_lt (induc_val, initial_def))))
7716             {
7717               induc_val = initial_def;
7718               /* Communicate we used the initial_def to epilouge
7719                  generation.  */
7720               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7721             }
7722           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7723         }
7724       else if (nested_cycle)
7725         {
7726           /* Do not use an adjustment def as that case is not supported
7727              correctly if ncopies is not one.  */
7728           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7729                                          ncopies, initial_def,
7730                                          &vec_initial_defs);
7731         }
7732       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7733                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7734         /* Fill the initial vector with the initial scalar value.  */
7735         vec_initial_def
7736           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7737                                            initial_def, initial_def);
7738       else
7739         {
7740           if (ncopies == 1)
7741             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7742           if (!reduc_info->reduc_initial_values.is_empty ())
7743             {
7744               initial_def = reduc_info->reduc_initial_values[0];
7745               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7746               tree neutral_op
7747                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7748                                             code, initial_def);
7749               gcc_assert (neutral_op);
7750               /* Try to simplify the vector initialization by applying an
7751                  adjustment after the reduction has been performed.  */
7752               if (!reduc_info->reused_accumulator
7753                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7754                   && !operand_equal_p (neutral_op, initial_def))
7755                 {
7756                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7757                     = initial_def;
7758                   initial_def = neutral_op;
7759                 }
7760               vec_initial_def
7761                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7762                                                  initial_def, neutral_op);
7763             }
7764         }
7765     }
7766
7767   if (vec_initial_def)
7768     {
7769       vec_initial_defs.create (ncopies);
7770       for (i = 0; i < ncopies; ++i)
7771         vec_initial_defs.quick_push (vec_initial_def);
7772     }
7773
7774   if (auto *accumulator = reduc_info->reused_accumulator)
7775     {
7776       tree def = accumulator->reduc_input;
7777       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7778         {
7779           unsigned int nreduc;
7780           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7781                                             (TREE_TYPE (def)),
7782                                           TYPE_VECTOR_SUBPARTS (vectype_out),
7783                                           &nreduc);
7784           gcc_assert (res);
7785           gimple_seq stmts = NULL;
7786           /* Reduce the single vector to a smaller one.  */
7787           if (nreduc != 1)
7788             {
7789               /* Perform the reduction in the appropriate type.  */
7790               tree rvectype = vectype_out;
7791               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7792                                               TREE_TYPE (TREE_TYPE (def))))
7793                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7794                                               TYPE_VECTOR_SUBPARTS
7795                                                 (vectype_out));
7796               def = vect_create_partial_epilog (def, rvectype,
7797                                                 STMT_VINFO_REDUC_CODE
7798                                                   (reduc_info),
7799                                                 &stmts);
7800             }
7801           /* The epilogue loop might use a different vector mode, like
7802              VNx2DI vs. V2DI.  */
7803           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7804             {
7805               tree reduc_type = build_vector_type_for_mode
7806                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7807               def = gimple_convert (&stmts, reduc_type, def);
7808             }
7809           /* Adjust the input so we pick up the partially reduced value
7810              for the skip edge in vect_create_epilog_for_reduction.  */
7811           accumulator->reduc_input = def;
7812           /* And the reduction could be carried out using a different sign.  */
7813           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7814             def = gimple_convert (&stmts, vectype_out, def);
7815           if (loop_vinfo->main_loop_edge)
7816             {
7817               /* While we'd like to insert on the edge this will split
7818                  blocks and disturb bookkeeping, we also will eventually
7819                  need this on the skip edge.  Rely on sinking to
7820                  fixup optimal placement and insert in the pred.  */
7821               gimple_stmt_iterator gsi
7822                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7823               /* Insert before a cond that eventually skips the
7824                  epilogue.  */
7825               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7826                 gsi_prev (&gsi);
7827               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7828             }
7829           else
7830             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7831                                               stmts);
7832         }
7833       if (loop_vinfo->main_loop_edge)
7834         vec_initial_defs[0]
7835           = vect_get_main_loop_result (loop_vinfo, def,
7836                                        vec_initial_defs[0]);
7837       else
7838         vec_initial_defs.safe_push (def);
7839     }
7840
7841   /* Generate the reduction PHIs upfront.  */
7842   for (i = 0; i < vec_num; i++)
7843     {
7844       tree vec_init_def = vec_initial_defs[i];
7845       for (j = 0; j < ncopies; j++)
7846         {
7847           /* Create the reduction-phi that defines the reduction
7848              operand.  */
7849           gphi *new_phi = create_phi_node (vec_dest, loop->header);
7850
7851           /* Set the loop-entry arg of the reduction-phi.  */
7852           if (j != 0 && nested_cycle)
7853             vec_init_def = vec_initial_defs[j];
7854           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7855                        UNKNOWN_LOCATION);
7856
7857           /* The loop-latch arg is set in epilogue processing.  */
7858
7859           if (slp_node)
7860             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7861           else
7862             {
7863               if (j == 0)
7864                 *vec_stmt = new_phi;
7865               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7866             }
7867         }
7868     }
7869
7870   return true;
7871 }
7872
7873 /* Vectorizes LC PHIs.  */
7874
7875 bool
7876 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7877                      stmt_vec_info stmt_info, gimple **vec_stmt,
7878                      slp_tree slp_node)
7879 {
7880   if (!loop_vinfo
7881       || !is_a <gphi *> (stmt_info->stmt)
7882       || gimple_phi_num_args (stmt_info->stmt) != 1)
7883     return false;
7884
7885   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7886       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7887     return false;
7888
7889   if (!vec_stmt) /* transformation not required.  */
7890     {
7891       /* Deal with copies from externs or constants that disguise as
7892          loop-closed PHI nodes (PR97886).  */
7893       if (slp_node
7894           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7895                                                 SLP_TREE_VECTYPE (slp_node)))
7896         {
7897           if (dump_enabled_p ())
7898             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899                              "incompatible vector types for invariants\n");
7900           return false;
7901         }
7902       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7903       return true;
7904     }
7905
7906   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7907   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7908   basic_block bb = gimple_bb (stmt_info->stmt);
7909   edge e = single_pred_edge (bb);
7910   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7911   auto_vec<tree> vec_oprnds;
7912   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7913                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7914                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7915   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7916     {
7917       /* Create the vectorized LC PHI node.  */
7918       gphi *new_phi = create_phi_node (vec_dest, bb);
7919       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7920       if (slp_node)
7921         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7922       else
7923         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7924     }
7925   if (!slp_node)
7926     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7927
7928   return true;
7929 }
7930
7931 /* Vectorizes PHIs.  */
7932
7933 bool
7934 vectorizable_phi (vec_info *,
7935                   stmt_vec_info stmt_info, gimple **vec_stmt,
7936                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7937 {
7938   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7939     return false;
7940
7941   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7942     return false;
7943
7944   tree vectype = SLP_TREE_VECTYPE (slp_node);
7945
7946   if (!vec_stmt) /* transformation not required.  */
7947     {
7948       slp_tree child;
7949       unsigned i;
7950       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7951         if (!child)
7952           {
7953             if (dump_enabled_p ())
7954               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7955                                "PHI node with unvectorized backedge def\n");
7956             return false;
7957           }
7958         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7959           {
7960             if (dump_enabled_p ())
7961               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7962                                "incompatible vector types for invariants\n");
7963             return false;
7964           }
7965         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7966                  && !useless_type_conversion_p (vectype,
7967                                                 SLP_TREE_VECTYPE (child)))
7968           {
7969             /* With bools we can have mask and non-mask precision vectors
7970                or different non-mask precisions.  while pattern recog is
7971                supposed to guarantee consistency here bugs in it can cause
7972                mismatches (PR103489 and PR103800 for example).
7973                Deal with them here instead of ICEing later.  */
7974             if (dump_enabled_p ())
7975               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7976                                "incompatible vector type setup from "
7977                                "bool pattern detection\n");
7978             return false;
7979           }
7980
7981       /* For single-argument PHIs assume coalescing which means zero cost
7982          for the scalar and the vector PHIs.  This avoids artificially
7983          favoring the vector path (but may pessimize it in some cases).  */
7984       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7985         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7986                           vector_stmt, stmt_info, vectype, 0, vect_body);
7987       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7988       return true;
7989     }
7990
7991   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7992   basic_block bb = gimple_bb (stmt_info->stmt);
7993   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7994   auto_vec<gphi *> new_phis;
7995   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7996     {
7997       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7998
7999       /* Skip not yet vectorized defs.  */
8000       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8001           && SLP_TREE_VEC_STMTS (child).is_empty ())
8002         continue;
8003
8004       auto_vec<tree> vec_oprnds;
8005       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8006       if (!new_phis.exists ())
8007         {
8008           new_phis.create (vec_oprnds.length ());
8009           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8010             {
8011               /* Create the vectorized LC PHI node.  */
8012               new_phis.quick_push (create_phi_node (vec_dest, bb));
8013               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8014             }
8015         }
8016       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8017       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8018         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8019     }
8020   /* We should have at least one already vectorized child.  */
8021   gcc_assert (new_phis.exists ());
8022
8023   return true;
8024 }
8025
8026 /* Return true if VECTYPE represents a vector that requires lowering
8027    by the vector lowering pass.  */
8028
8029 bool
8030 vect_emulated_vector_p (tree vectype)
8031 {
8032   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8033           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8034               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8035 }
8036
8037 /* Return true if we can emulate CODE on an integer mode representation
8038    of a vector.  */
8039
8040 bool
8041 vect_can_vectorize_without_simd_p (tree_code code)
8042 {
8043   switch (code)
8044     {
8045     case PLUS_EXPR:
8046     case MINUS_EXPR:
8047     case NEGATE_EXPR:
8048     case BIT_AND_EXPR:
8049     case BIT_IOR_EXPR:
8050     case BIT_XOR_EXPR:
8051     case BIT_NOT_EXPR:
8052       return true;
8053
8054     default:
8055       return false;
8056     }
8057 }
8058
8059 /* Likewise, but taking a code_helper.  */
8060
8061 bool
8062 vect_can_vectorize_without_simd_p (code_helper code)
8063 {
8064   return (code.is_tree_code ()
8065           && vect_can_vectorize_without_simd_p (tree_code (code)));
8066 }
8067
8068 /* Function vectorizable_induction
8069
8070    Check if STMT_INFO performs an induction computation that can be vectorized.
8071    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8072    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8073    Return true if STMT_INFO is vectorizable in this way.  */
8074
8075 bool
8076 vectorizable_induction (loop_vec_info loop_vinfo,
8077                         stmt_vec_info stmt_info,
8078                         gimple **vec_stmt, slp_tree slp_node,
8079                         stmt_vector_for_cost *cost_vec)
8080 {
8081   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8082   unsigned ncopies;
8083   bool nested_in_vect_loop = false;
8084   class loop *iv_loop;
8085   tree vec_def;
8086   edge pe = loop_preheader_edge (loop);
8087   basic_block new_bb;
8088   tree new_vec, vec_init, vec_step, t;
8089   tree new_name;
8090   gimple *new_stmt;
8091   gphi *induction_phi;
8092   tree induc_def, vec_dest;
8093   tree init_expr, step_expr;
8094   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8095   unsigned i;
8096   tree expr;
8097   gimple_stmt_iterator si;
8098
8099   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8100   if (!phi)
8101     return false;
8102
8103   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8104     return false;
8105
8106   /* Make sure it was recognized as induction computation.  */
8107   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8108     return false;
8109
8110   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8111   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8112
8113   if (slp_node)
8114     ncopies = 1;
8115   else
8116     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8117   gcc_assert (ncopies >= 1);
8118
8119   /* FORNOW. These restrictions should be relaxed.  */
8120   if (nested_in_vect_loop_p (loop, stmt_info))
8121     {
8122       imm_use_iterator imm_iter;
8123       use_operand_p use_p;
8124       gimple *exit_phi;
8125       edge latch_e;
8126       tree loop_arg;
8127
8128       if (ncopies > 1)
8129         {
8130           if (dump_enabled_p ())
8131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8132                              "multiple types in nested loop.\n");
8133           return false;
8134         }
8135
8136       exit_phi = NULL;
8137       latch_e = loop_latch_edge (loop->inner);
8138       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8139       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8140         {
8141           gimple *use_stmt = USE_STMT (use_p);
8142           if (is_gimple_debug (use_stmt))
8143             continue;
8144
8145           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8146             {
8147               exit_phi = use_stmt;
8148               break;
8149             }
8150         }
8151       if (exit_phi)
8152         {
8153           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8154           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8155                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8156             {
8157               if (dump_enabled_p ())
8158                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8159                                  "inner-loop induction only used outside "
8160                                  "of the outer vectorized loop.\n");
8161               return false;
8162             }
8163         }
8164
8165       nested_in_vect_loop = true;
8166       iv_loop = loop->inner;
8167     }
8168   else
8169     iv_loop = loop;
8170   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8171
8172   if (slp_node && !nunits.is_constant ())
8173     {
8174       /* The current SLP code creates the step value element-by-element.  */
8175       if (dump_enabled_p ())
8176         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8177                          "SLP induction not supported for variable-length"
8178                          " vectors.\n");
8179       return false;
8180     }
8181
8182   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
8183     {
8184       if (dump_enabled_p ())
8185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8186                          "floating point induction vectorization disabled\n");
8187       return false;
8188     }
8189
8190   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8191   gcc_assert (step_expr != NULL_TREE);
8192   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8193
8194   /* Check for backend support of PLUS/MINUS_EXPR. */
8195   if (!directly_supported_p (PLUS_EXPR, step_vectype)
8196       || !directly_supported_p (MINUS_EXPR, step_vectype))
8197     return false;
8198
8199   if (!vec_stmt) /* transformation not required.  */
8200     {
8201       unsigned inside_cost = 0, prologue_cost = 0;
8202       if (slp_node)
8203         {
8204           /* We eventually need to set a vector type on invariant
8205              arguments.  */
8206           unsigned j;
8207           slp_tree child;
8208           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8209             if (!vect_maybe_update_slp_op_vectype
8210                 (child, SLP_TREE_VECTYPE (slp_node)))
8211               {
8212                 if (dump_enabled_p ())
8213                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8214                                    "incompatible vector types for "
8215                                    "invariants\n");
8216                 return false;
8217               }
8218           /* loop cost for vec_loop.  */
8219           inside_cost
8220             = record_stmt_cost (cost_vec,
8221                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8222                                 vector_stmt, stmt_info, 0, vect_body);
8223           /* prologue cost for vec_init (if not nested) and step.  */
8224           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8225                                             scalar_to_vec,
8226                                             stmt_info, 0, vect_prologue);
8227         }
8228       else /* if (!slp_node) */
8229         {
8230           /* loop cost for vec_loop.  */
8231           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8232                                           stmt_info, 0, vect_body);
8233           /* prologue cost for vec_init and vec_step.  */
8234           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8235                                             stmt_info, 0, vect_prologue);
8236         }
8237       if (dump_enabled_p ())
8238         dump_printf_loc (MSG_NOTE, vect_location,
8239                          "vect_model_induction_cost: inside_cost = %d, "
8240                          "prologue_cost = %d .\n", inside_cost,
8241                          prologue_cost);
8242
8243       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8244       DUMP_VECT_SCOPE ("vectorizable_induction");
8245       return true;
8246     }
8247
8248   /* Transform.  */
8249
8250   /* Compute a vector variable, initialized with the first VF values of
8251      the induction variable.  E.g., for an iv with IV_PHI='X' and
8252      evolution S, for a vector of 4 units, we want to compute:
8253      [X, X + S, X + 2*S, X + 3*S].  */
8254
8255   if (dump_enabled_p ())
8256     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8257
8258   pe = loop_preheader_edge (iv_loop);
8259   /* Find the first insertion point in the BB.  */
8260   basic_block bb = gimple_bb (phi);
8261   si = gsi_after_labels (bb);
8262
8263   /* For SLP induction we have to generate several IVs as for example
8264      with group size 3 we need
8265        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8266        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8267   if (slp_node)
8268     {
8269       /* Enforced above.  */
8270       unsigned int const_nunits = nunits.to_constant ();
8271
8272       /* The initial values are vectorized, but any lanes > group_size
8273          need adjustment.  */
8274       slp_tree init_node
8275         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8276
8277       /* Gather steps.  Since we do not vectorize inductions as
8278          cycles we have to reconstruct the step from SCEV data.  */
8279       unsigned group_size = SLP_TREE_LANES (slp_node);
8280       tree *steps = XALLOCAVEC (tree, group_size);
8281       tree *inits = XALLOCAVEC (tree, group_size);
8282       stmt_vec_info phi_info;
8283       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8284         {
8285           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8286           if (!init_node)
8287             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8288                                            pe->dest_idx);
8289         }
8290
8291       /* Now generate the IVs.  */
8292       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8293       gcc_assert ((const_nunits * nvects) % group_size == 0);
8294       unsigned nivs;
8295       if (nested_in_vect_loop)
8296         nivs = nvects;
8297       else
8298         {
8299           /* Compute the number of distinct IVs we need.  First reduce
8300              group_size if it is a multiple of const_nunits so we get
8301              one IV for a group_size of 4 but const_nunits 2.  */
8302           unsigned group_sizep = group_size;
8303           if (group_sizep % const_nunits == 0)
8304             group_sizep = group_sizep / const_nunits;
8305           nivs = least_common_multiple (group_sizep,
8306                                         const_nunits) / const_nunits;
8307         }
8308       tree stept = TREE_TYPE (step_vectype);
8309       tree lupdate_mul = NULL_TREE;
8310       if (!nested_in_vect_loop)
8311         {
8312           /* The number of iterations covered in one vector iteration.  */
8313           unsigned lup_mul = (nvects * const_nunits) / group_size;
8314           lupdate_mul
8315             = build_vector_from_val (step_vectype,
8316                                      SCALAR_FLOAT_TYPE_P (stept)
8317                                      ? build_real_from_wide (stept, lup_mul,
8318                                                              UNSIGNED)
8319                                      : build_int_cstu (stept, lup_mul));
8320         }
8321       tree peel_mul = NULL_TREE;
8322       gimple_seq init_stmts = NULL;
8323       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8324         {
8325           if (SCALAR_FLOAT_TYPE_P (stept))
8326             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8327                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8328           else
8329             peel_mul = gimple_convert (&init_stmts, stept,
8330                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8331           peel_mul = gimple_build_vector_from_val (&init_stmts,
8332                                                    step_vectype, peel_mul);
8333         }
8334       unsigned ivn;
8335       auto_vec<tree> vec_steps;
8336       for (ivn = 0; ivn < nivs; ++ivn)
8337         {
8338           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8339           tree_vector_builder init_elts (vectype, const_nunits, 1);
8340           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8341           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8342             {
8343               /* The scalar steps of the IVs.  */
8344               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8345               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8346               step_elts.quick_push (elt);
8347               if (!init_node)
8348                 {
8349                   /* The scalar inits of the IVs if not vectorized.  */
8350                   elt = inits[(ivn*const_nunits + eltn) % group_size];
8351                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
8352                                                   TREE_TYPE (elt)))
8353                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8354                                         TREE_TYPE (vectype), elt);
8355                   init_elts.quick_push (elt);
8356                 }
8357               /* The number of steps to add to the initial values.  */
8358               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8359               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8360                                    ? build_real_from_wide (stept,
8361                                                            mul_elt, UNSIGNED)
8362                                    : build_int_cstu (stept, mul_elt));
8363             }
8364           vec_step = gimple_build_vector (&init_stmts, &step_elts);
8365           vec_steps.safe_push (vec_step);
8366           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8367           if (peel_mul)
8368             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8369                                      step_mul, peel_mul);
8370           if (!init_node)
8371             vec_init = gimple_build_vector (&init_stmts, &init_elts);
8372
8373           /* Create the induction-phi that defines the induction-operand.  */
8374           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8375                                             "vec_iv_");
8376           induction_phi = create_phi_node (vec_dest, iv_loop->header);
8377           induc_def = PHI_RESULT (induction_phi);
8378
8379           /* Create the iv update inside the loop  */
8380           tree up = vec_step;
8381           if (lupdate_mul)
8382             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8383                                vec_step, lupdate_mul);
8384           gimple_seq stmts = NULL;
8385           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8386           vec_def = gimple_build (&stmts,
8387                                   PLUS_EXPR, step_vectype, vec_def, up);
8388           vec_def = gimple_convert (&stmts, vectype, vec_def);
8389           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8390           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8391                        UNKNOWN_LOCATION);
8392
8393           if (init_node)
8394             vec_init = vect_get_slp_vect_def (init_node, ivn);
8395           if (!nested_in_vect_loop
8396               && !integer_zerop (step_mul))
8397             {
8398               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8399               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8400                                  vec_step, step_mul);
8401               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8402                                       vec_def, up);
8403               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8404             }
8405
8406           /* Set the arguments of the phi node:  */
8407           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8408
8409           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8410         }
8411       if (!nested_in_vect_loop)
8412         {
8413           /* Fill up to the number of vectors we need for the whole group.  */
8414           nivs = least_common_multiple (group_size,
8415                                         const_nunits) / const_nunits;
8416           vec_steps.reserve (nivs-ivn);
8417           for (; ivn < nivs; ++ivn)
8418             {
8419               SLP_TREE_VEC_STMTS (slp_node)
8420                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8421               vec_steps.quick_push (vec_steps[0]);
8422             }
8423         }
8424
8425       /* Re-use IVs when we can.  We are generating further vector
8426          stmts by adding VF' * stride to the IVs generated above.  */
8427       if (ivn < nvects)
8428         {
8429           unsigned vfp
8430             = least_common_multiple (group_size, const_nunits) / group_size;
8431           tree lupdate_mul
8432             = build_vector_from_val (step_vectype,
8433                                      SCALAR_FLOAT_TYPE_P (stept)
8434                                      ? build_real_from_wide (stept,
8435                                                              vfp, UNSIGNED)
8436                                      : build_int_cstu (stept, vfp));
8437           for (; ivn < nvects; ++ivn)
8438             {
8439               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8440               tree def = gimple_get_lhs (iv);
8441               if (ivn < 2*nivs)
8442                 vec_steps[ivn - nivs]
8443                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8444                                   vec_steps[ivn - nivs], lupdate_mul);
8445               gimple_seq stmts = NULL;
8446               def = gimple_convert (&stmts, step_vectype, def);
8447               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8448                                   def, vec_steps[ivn % nivs]);
8449               def = gimple_convert (&stmts, vectype, def);
8450               if (gimple_code (iv) == GIMPLE_PHI)
8451                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8452               else
8453                 {
8454                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8455                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8456                 }
8457               SLP_TREE_VEC_STMTS (slp_node)
8458                 .quick_push (SSA_NAME_DEF_STMT (def));
8459             }
8460         }
8461
8462       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8463       gcc_assert (!new_bb);
8464
8465       return true;
8466     }
8467
8468   init_expr = vect_phi_initial_value (phi);
8469
8470   gimple_seq stmts = NULL;
8471   if (!nested_in_vect_loop)
8472     {
8473       /* Convert the initial value to the IV update type.  */
8474       tree new_type = TREE_TYPE (step_expr);
8475       init_expr = gimple_convert (&stmts, new_type, init_expr);
8476
8477       /* If we are using the loop mask to "peel" for alignment then we need
8478          to adjust the start value here.  */
8479       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8480       if (skip_niters != NULL_TREE)
8481         {
8482           if (FLOAT_TYPE_P (vectype))
8483             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8484                                         skip_niters);
8485           else
8486             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8487           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8488                                          skip_niters, step_expr);
8489           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8490                                     init_expr, skip_step);
8491         }
8492     }
8493
8494   if (stmts)
8495     {
8496       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8497       gcc_assert (!new_bb);
8498     }
8499
8500   /* Create the vector that holds the initial_value of the induction.  */
8501   if (nested_in_vect_loop)
8502     {
8503       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8504          been created during vectorization of previous stmts.  We obtain it
8505          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8506       auto_vec<tree> vec_inits;
8507       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8508                                      init_expr, &vec_inits);
8509       vec_init = vec_inits[0];
8510       /* If the initial value is not of proper type, convert it.  */
8511       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8512         {
8513           new_stmt
8514             = gimple_build_assign (vect_get_new_ssa_name (vectype,
8515                                                           vect_simple_var,
8516                                                           "vec_iv_"),
8517                                    VIEW_CONVERT_EXPR,
8518                                    build1 (VIEW_CONVERT_EXPR, vectype,
8519                                            vec_init));
8520           vec_init = gimple_assign_lhs (new_stmt);
8521           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8522                                                  new_stmt);
8523           gcc_assert (!new_bb);
8524         }
8525     }
8526   else
8527     {
8528       /* iv_loop is the loop to be vectorized. Create:
8529          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8530       stmts = NULL;
8531       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8532
8533       unsigned HOST_WIDE_INT const_nunits;
8534       if (nunits.is_constant (&const_nunits))
8535         {
8536           tree_vector_builder elts (step_vectype, const_nunits, 1);
8537           elts.quick_push (new_name);
8538           for (i = 1; i < const_nunits; i++)
8539             {
8540               /* Create: new_name_i = new_name + step_expr  */
8541               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8542                                        new_name, step_expr);
8543               elts.quick_push (new_name);
8544             }
8545           /* Create a vector from [new_name_0, new_name_1, ...,
8546              new_name_nunits-1]  */
8547           vec_init = gimple_build_vector (&stmts, &elts);
8548         }
8549       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8550         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
8551         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8552                                  new_name, step_expr);
8553       else
8554         {
8555           /* Build:
8556                 [base, base, base, ...]
8557                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8558           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8559           gcc_assert (flag_associative_math);
8560           tree index = build_index_vector (step_vectype, 0, 1);
8561           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8562                                                         new_name);
8563           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8564                                                         step_expr);
8565           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8566           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8567                                    vec_init, step_vec);
8568           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8569                                    vec_init, base_vec);
8570         }
8571       vec_init = gimple_convert (&stmts, vectype, vec_init);
8572
8573       if (stmts)
8574         {
8575           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8576           gcc_assert (!new_bb);
8577         }
8578     }
8579
8580
8581   /* Create the vector that holds the step of the induction.  */
8582   if (nested_in_vect_loop)
8583     /* iv_loop is nested in the loop to be vectorized. Generate:
8584        vec_step = [S, S, S, S]  */
8585     new_name = step_expr;
8586   else
8587     {
8588       /* iv_loop is the loop to be vectorized. Generate:
8589           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8590       gimple_seq seq = NULL;
8591       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8592         {
8593           expr = build_int_cst (integer_type_node, vf);
8594           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8595         }
8596       else
8597         expr = build_int_cst (TREE_TYPE (step_expr), vf);
8598       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8599                                expr, step_expr);
8600       if (seq)
8601         {
8602           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8603           gcc_assert (!new_bb);
8604         }
8605     }
8606
8607   t = unshare_expr (new_name);
8608   gcc_assert (CONSTANT_CLASS_P (new_name)
8609               || TREE_CODE (new_name) == SSA_NAME);
8610   new_vec = build_vector_from_val (step_vectype, t);
8611   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8612                                new_vec, step_vectype, NULL);
8613
8614
8615   /* Create the following def-use cycle:
8616      loop prolog:
8617          vec_init = ...
8618          vec_step = ...
8619      loop:
8620          vec_iv = PHI <vec_init, vec_loop>
8621          ...
8622          STMT
8623          ...
8624          vec_loop = vec_iv + vec_step;  */
8625
8626   /* Create the induction-phi that defines the induction-operand.  */
8627   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8628   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8629   induc_def = PHI_RESULT (induction_phi);
8630
8631   /* Create the iv update inside the loop  */
8632   stmts = NULL;
8633   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8634   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8635   vec_def = gimple_convert (&stmts, vectype, vec_def);
8636   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8637   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8638
8639   /* Set the arguments of the phi node:  */
8640   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8641   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8642                UNKNOWN_LOCATION);
8643
8644   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8645   *vec_stmt = induction_phi;
8646
8647   /* In case that vectorization factor (VF) is bigger than the number
8648      of elements that we can fit in a vectype (nunits), we have to generate
8649      more than one vector stmt - i.e - we need to "unroll" the
8650      vector stmt by a factor VF/nunits.  For more details see documentation
8651      in vectorizable_operation.  */
8652
8653   if (ncopies > 1)
8654     {
8655       gimple_seq seq = NULL;
8656       /* FORNOW. This restriction should be relaxed.  */
8657       gcc_assert (!nested_in_vect_loop);
8658
8659       /* Create the vector that holds the step of the induction.  */
8660       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8661         {
8662           expr = build_int_cst (integer_type_node, nunits);
8663           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8664         }
8665       else
8666         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8667       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8668                                expr, step_expr);
8669       if (seq)
8670         {
8671           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8672           gcc_assert (!new_bb);
8673         }
8674
8675       t = unshare_expr (new_name);
8676       gcc_assert (CONSTANT_CLASS_P (new_name)
8677                   || TREE_CODE (new_name) == SSA_NAME);
8678       new_vec = build_vector_from_val (step_vectype, t);
8679       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8680                                    new_vec, step_vectype, NULL);
8681
8682       vec_def = induc_def;
8683       for (i = 1; i < ncopies; i++)
8684         {
8685           /* vec_i = vec_prev + vec_step  */
8686           gimple_seq stmts = NULL;
8687           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8688           vec_def = gimple_build (&stmts,
8689                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
8690           vec_def = gimple_convert (&stmts, vectype, vec_def);
8691
8692           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8693           new_stmt = SSA_NAME_DEF_STMT (vec_def);
8694           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8695         }
8696     }
8697
8698   if (dump_enabled_p ())
8699     dump_printf_loc (MSG_NOTE, vect_location,
8700                      "transform induction: created def-use cycle: %G%G",
8701                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
8702
8703   return true;
8704 }
8705
8706 /* Function vectorizable_live_operation.
8707
8708    STMT_INFO computes a value that is used outside the loop.  Check if
8709    it can be supported.  */
8710
8711 bool
8712 vectorizable_live_operation (vec_info *vinfo,
8713                              stmt_vec_info stmt_info,
8714                              gimple_stmt_iterator *gsi,
8715                              slp_tree slp_node, slp_instance slp_node_instance,
8716                              int slp_index, bool vec_stmt_p,
8717                              stmt_vector_for_cost *cost_vec)
8718 {
8719   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8720   imm_use_iterator imm_iter;
8721   tree lhs, lhs_type, bitsize;
8722   tree vectype = (slp_node
8723                   ? SLP_TREE_VECTYPE (slp_node)
8724                   : STMT_VINFO_VECTYPE (stmt_info));
8725   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8726   int ncopies;
8727   gimple *use_stmt;
8728   auto_vec<tree> vec_oprnds;
8729   int vec_entry = 0;
8730   poly_uint64 vec_index = 0;
8731
8732   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8733
8734   /* If a stmt of a reduction is live, vectorize it via
8735      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8736      validity so just trigger the transform here.  */
8737   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8738     {
8739       if (!vec_stmt_p)
8740         return true;
8741       if (slp_node)
8742         {
8743           /* For reduction chains the meta-info is attached to
8744              the group leader.  */
8745           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8746             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8747           /* For SLP reductions we vectorize the epilogue for
8748              all involved stmts together.  */
8749           else if (slp_index != 0)
8750             return true;
8751           else
8752             /* For SLP reductions the meta-info is attached to
8753                the representative.  */
8754             stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8755         }
8756       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8757       gcc_assert (reduc_info->is_reduc_info);
8758       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8759           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8760         return true;
8761       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8762                                         slp_node_instance);
8763       return true;
8764     }
8765
8766   /* If STMT is not relevant and it is a simple assignment and its inputs are
8767      invariant then it can remain in place, unvectorized.  The original last
8768      scalar value that it computes will be used.  */
8769   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8770     {
8771       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8772       if (dump_enabled_p ())
8773         dump_printf_loc (MSG_NOTE, vect_location,
8774                          "statement is simple and uses invariant.  Leaving in "
8775                          "place.\n");
8776       return true;
8777     }
8778
8779   if (slp_node)
8780     ncopies = 1;
8781   else
8782     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8783
8784   if (slp_node)
8785     {
8786       gcc_assert (slp_index >= 0);
8787
8788       /* Get the last occurrence of the scalar index from the concatenation of
8789          all the slp vectors. Calculate which slp vector it is and the index
8790          within.  */
8791       int num_scalar = SLP_TREE_LANES (slp_node);
8792       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8793       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8794
8795       /* Calculate which vector contains the result, and which lane of
8796          that vector we need.  */
8797       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8798         {
8799           if (dump_enabled_p ())
8800             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8801                              "Cannot determine which vector holds the"
8802                              " final result.\n");
8803           return false;
8804         }
8805     }
8806
8807   if (!vec_stmt_p)
8808     {
8809       /* No transformation required.  */
8810       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8811         {
8812           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8813                                                OPTIMIZE_FOR_SPEED))
8814             {
8815               if (dump_enabled_p ())
8816                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8817                                  "can't operate on partial vectors "
8818                                  "because the target doesn't support extract "
8819                                  "last reduction.\n");
8820               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8821             }
8822           else if (slp_node)
8823             {
8824               if (dump_enabled_p ())
8825                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8826                                  "can't operate on partial vectors "
8827                                  "because an SLP statement is live after "
8828                                  "the loop.\n");
8829               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8830             }
8831           else if (ncopies > 1)
8832             {
8833               if (dump_enabled_p ())
8834                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8835                                  "can't operate on partial vectors "
8836                                  "because ncopies is greater than 1.\n");
8837               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8838             }
8839           else
8840             {
8841               gcc_assert (ncopies == 1 && !slp_node);
8842               vect_record_loop_mask (loop_vinfo,
8843                                      &LOOP_VINFO_MASKS (loop_vinfo),
8844                                      1, vectype, NULL);
8845             }
8846         }
8847       /* ???  Enable for loop costing as well.  */
8848       if (!loop_vinfo)
8849         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8850                           0, vect_epilogue);
8851       return true;
8852     }
8853
8854   /* Use the lhs of the original scalar statement.  */
8855   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8856   if (dump_enabled_p ())
8857     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8858                      "stmt %G", stmt);
8859
8860   lhs = gimple_get_lhs (stmt);
8861   lhs_type = TREE_TYPE (lhs);
8862
8863   bitsize = vector_element_bits_tree (vectype);
8864
8865   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8866   tree vec_lhs, bitstart;
8867   gimple *vec_stmt;
8868   if (slp_node)
8869     {
8870       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8871
8872       /* Get the correct slp vectorized stmt.  */
8873       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8874       vec_lhs = gimple_get_lhs (vec_stmt);
8875
8876       /* Get entry to use.  */
8877       bitstart = bitsize_int (vec_index);
8878       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8879     }
8880   else
8881     {
8882       /* For multiple copies, get the last copy.  */
8883       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8884       vec_lhs = gimple_get_lhs (vec_stmt);
8885
8886       /* Get the last lane in the vector.  */
8887       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8888     }
8889
8890   if (loop_vinfo)
8891     {
8892       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8893          requirement, insert one phi node for it.  It looks like:
8894            loop;
8895          BB:
8896            # lhs' = PHI <lhs>
8897          ==>
8898            loop;
8899          BB:
8900            # vec_lhs' = PHI <vec_lhs>
8901            new_tree = lane_extract <vec_lhs', ...>;
8902            lhs' = new_tree;  */
8903
8904       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8905       basic_block exit_bb = single_exit (loop)->dest;
8906       gcc_assert (single_pred_p (exit_bb));
8907
8908       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8909       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8910       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8911
8912       gimple_seq stmts = NULL;
8913       tree new_tree;
8914       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8915         {
8916           /* Emit:
8917
8918                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8919
8920              where VEC_LHS is the vectorized live-out result and MASK is
8921              the loop mask for the final iteration.  */
8922           gcc_assert (ncopies == 1 && !slp_node);
8923           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8924           tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8925                                           1, vectype, 0);
8926           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8927                                           mask, vec_lhs_phi);
8928
8929           /* Convert the extracted vector element to the scalar type.  */
8930           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8931         }
8932       else
8933         {
8934           tree bftype = TREE_TYPE (vectype);
8935           if (VECTOR_BOOLEAN_TYPE_P (vectype))
8936             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8937           new_tree = build3 (BIT_FIELD_REF, bftype,
8938                              vec_lhs_phi, bitsize, bitstart);
8939           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8940                                            &stmts, true, NULL_TREE);
8941         }
8942
8943       if (stmts)
8944         {
8945           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8946           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8947
8948           /* Remove existing phi from lhs and create one copy from new_tree.  */
8949           tree lhs_phi = NULL_TREE;
8950           gimple_stmt_iterator gsi;
8951           for (gsi = gsi_start_phis (exit_bb);
8952                !gsi_end_p (gsi); gsi_next (&gsi))
8953             {
8954               gimple *phi = gsi_stmt (gsi);
8955               if ((gimple_phi_arg_def (phi, 0) == lhs))
8956                 {
8957                   remove_phi_node (&gsi, false);
8958                   lhs_phi = gimple_phi_result (phi);
8959                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8960                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8961                   break;
8962                 }
8963             }
8964         }
8965
8966       /* Replace use of lhs with newly computed result.  If the use stmt is a
8967          single arg PHI, just replace all uses of PHI result.  It's necessary
8968          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8969       use_operand_p use_p;
8970       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8971         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8972             && !is_gimple_debug (use_stmt))
8973           {
8974             if (gimple_code (use_stmt) == GIMPLE_PHI
8975                 && gimple_phi_num_args (use_stmt) == 1)
8976               {
8977                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8978               }
8979             else
8980               {
8981                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8982                     SET_USE (use_p, new_tree);
8983               }
8984             update_stmt (use_stmt);
8985           }
8986     }
8987   else
8988     {
8989       /* For basic-block vectorization simply insert the lane-extraction.  */
8990       tree bftype = TREE_TYPE (vectype);
8991       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8992         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8993       tree new_tree = build3 (BIT_FIELD_REF, bftype,
8994                               vec_lhs, bitsize, bitstart);
8995       gimple_seq stmts = NULL;
8996       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8997                                        &stmts, true, NULL_TREE);
8998       if (TREE_CODE (new_tree) == SSA_NAME
8999           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9000         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9001       if (is_a <gphi *> (vec_stmt))
9002         {
9003           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9004           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9005         }
9006       else
9007         {
9008           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9009           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9010         }
9011
9012       /* Replace use of lhs with newly computed result.  If the use stmt is a
9013          single arg PHI, just replace all uses of PHI result.  It's necessary
9014          because lcssa PHI defining lhs may be before newly inserted stmt.  */
9015       use_operand_p use_p;
9016       stmt_vec_info use_stmt_info;
9017       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9018         if (!is_gimple_debug (use_stmt)
9019             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9020                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9021           {
9022             /* ???  This can happen when the live lane ends up being
9023                used in a vector construction code-generated by an
9024                external SLP node (and code-generation for that already
9025                happened).  See gcc.dg/vect/bb-slp-47.c.
9026                Doing this is what would happen if that vector CTOR
9027                were not code-generated yet so it is not too bad.
9028                ???  In fact we'd likely want to avoid this situation
9029                in the first place.  */
9030             if (TREE_CODE (new_tree) == SSA_NAME
9031                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9032                 && gimple_code (use_stmt) != GIMPLE_PHI
9033                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9034                                                 use_stmt))
9035               {
9036                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
9037                 gcc_assert (code == CONSTRUCTOR
9038                             || code == VIEW_CONVERT_EXPR
9039                             || CONVERT_EXPR_CODE_P (code));
9040                 if (dump_enabled_p ())
9041                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9042                                    "Using original scalar computation for "
9043                                    "live lane because use preceeds vector "
9044                                    "def\n");
9045                 continue;
9046               }
9047             /* ???  It can also happen that we end up pulling a def into
9048                a loop where replacing out-of-loop uses would require
9049                a new LC SSA PHI node.  Retain the original scalar in
9050                those cases as well.  PR98064.  */
9051             if (TREE_CODE (new_tree) == SSA_NAME
9052                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9053                 && (gimple_bb (use_stmt)->loop_father
9054                     != gimple_bb (vec_stmt)->loop_father)
9055                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9056                                         gimple_bb (use_stmt)->loop_father))
9057               {
9058                 if (dump_enabled_p ())
9059                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9060                                    "Using original scalar computation for "
9061                                    "live lane because there is an out-of-loop "
9062                                    "definition for it\n");
9063                 continue;
9064               }
9065             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9066               SET_USE (use_p, new_tree);
9067             update_stmt (use_stmt);
9068           }
9069     }
9070
9071   return true;
9072 }
9073
9074 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
9075
9076 static void
9077 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9078 {
9079   ssa_op_iter op_iter;
9080   imm_use_iterator imm_iter;
9081   def_operand_p def_p;
9082   gimple *ustmt;
9083
9084   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9085     {
9086       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9087         {
9088           basic_block bb;
9089
9090           if (!is_gimple_debug (ustmt))
9091             continue;
9092
9093           bb = gimple_bb (ustmt);
9094
9095           if (!flow_bb_inside_loop_p (loop, bb))
9096             {
9097               if (gimple_debug_bind_p (ustmt))
9098                 {
9099                   if (dump_enabled_p ())
9100                     dump_printf_loc (MSG_NOTE, vect_location,
9101                                      "killing debug use\n");
9102
9103                   gimple_debug_bind_reset_value (ustmt);
9104                   update_stmt (ustmt);
9105                 }
9106               else
9107                 gcc_unreachable ();
9108             }
9109         }
9110     }
9111 }
9112
9113 /* Given loop represented by LOOP_VINFO, return true if computation of
9114    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9115    otherwise.  */
9116
9117 static bool
9118 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9119 {
9120   /* Constant case.  */
9121   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9122     {
9123       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9124       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9125
9126       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9127       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9128       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9129         return true;
9130     }
9131
9132   widest_int max;
9133   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9134   /* Check the upper bound of loop niters.  */
9135   if (get_max_loop_iterations (loop, &max))
9136     {
9137       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9138       signop sgn = TYPE_SIGN (type);
9139       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9140       if (max < type_max)
9141         return true;
9142     }
9143   return false;
9144 }
9145
9146 /* Return a mask type with half the number of elements as OLD_TYPE,
9147    given that it should have mode NEW_MODE.  */
9148
9149 tree
9150 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9151 {
9152   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9153   return build_truth_vector_type_for_mode (nunits, new_mode);
9154 }
9155
9156 /* Return a mask type with twice as many elements as OLD_TYPE,
9157    given that it should have mode NEW_MODE.  */
9158
9159 tree
9160 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9161 {
9162   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9163   return build_truth_vector_type_for_mode (nunits, new_mode);
9164 }
9165
9166 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9167    contain a sequence of NVECTORS masks that each control a vector of type
9168    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
9169    these vector masks with the vector version of SCALAR_MASK.  */
9170
9171 void
9172 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9173                        unsigned int nvectors, tree vectype, tree scalar_mask)
9174 {
9175   gcc_assert (nvectors != 0);
9176   if (masks->length () < nvectors)
9177     masks->safe_grow_cleared (nvectors, true);
9178   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9179   /* The number of scalars per iteration and the number of vectors are
9180      both compile-time constants.  */
9181   unsigned int nscalars_per_iter
9182     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9183                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9184
9185   if (scalar_mask)
9186     {
9187       scalar_cond_masked_key cond (scalar_mask, nvectors);
9188       loop_vinfo->scalar_cond_masked_set.add (cond);
9189     }
9190
9191   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9192     {
9193       rgm->max_nscalars_per_iter = nscalars_per_iter;
9194       rgm->type = truth_type_for (vectype);
9195       rgm->factor = 1;
9196     }
9197 }
9198
9199 /* Given a complete set of masks MASKS, extract mask number INDEX
9200    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9201    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
9202
9203    See the comment above vec_loop_masks for more details about the mask
9204    arrangement.  */
9205
9206 tree
9207 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9208                     unsigned int nvectors, tree vectype, unsigned int index)
9209 {
9210   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9211   tree mask_type = rgm->type;
9212
9213   /* Populate the rgroup's mask array, if this is the first time we've
9214      used it.  */
9215   if (rgm->controls.is_empty ())
9216     {
9217       rgm->controls.safe_grow_cleared (nvectors, true);
9218       for (unsigned int i = 0; i < nvectors; ++i)
9219         {
9220           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9221           /* Provide a dummy definition until the real one is available.  */
9222           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9223           rgm->controls[i] = mask;
9224         }
9225     }
9226
9227   tree mask = rgm->controls[index];
9228   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9229                 TYPE_VECTOR_SUBPARTS (vectype)))
9230     {
9231       /* A loop mask for data type X can be reused for data type Y
9232          if X has N times more elements than Y and if Y's elements
9233          are N times bigger than X's.  In this case each sequence
9234          of N elements in the loop mask will be all-zero or all-one.
9235          We can then view-convert the mask so that each sequence of
9236          N elements is replaced by a single element.  */
9237       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9238                               TYPE_VECTOR_SUBPARTS (vectype)));
9239       gimple_seq seq = NULL;
9240       mask_type = truth_type_for (vectype);
9241       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9242       if (seq)
9243         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9244     }
9245   return mask;
9246 }
9247
9248 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9249    lengths for controlling an operation on VECTYPE.  The operation splits
9250    each element of VECTYPE into FACTOR separate subelements, measuring the
9251    length as a number of these subelements.  */
9252
9253 void
9254 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9255                       unsigned int nvectors, tree vectype, unsigned int factor)
9256 {
9257   gcc_assert (nvectors != 0);
9258   if (lens->length () < nvectors)
9259     lens->safe_grow_cleared (nvectors, true);
9260   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9261
9262   /* The number of scalars per iteration, scalar occupied bytes and
9263      the number of vectors are both compile-time constants.  */
9264   unsigned int nscalars_per_iter
9265     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9266                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9267
9268   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9269     {
9270       /* For now, we only support cases in which all loads and stores fall back
9271          to VnQI or none do.  */
9272       gcc_assert (!rgl->max_nscalars_per_iter
9273                   || (rgl->factor == 1 && factor == 1)
9274                   || (rgl->max_nscalars_per_iter * rgl->factor
9275                       == nscalars_per_iter * factor));
9276       rgl->max_nscalars_per_iter = nscalars_per_iter;
9277       rgl->type = vectype;
9278       rgl->factor = factor;
9279     }
9280 }
9281
9282 /* Given a complete set of length LENS, extract length number INDEX for an
9283    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9284
9285 tree
9286 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9287                    unsigned int nvectors, unsigned int index)
9288 {
9289   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9290   bool use_bias_adjusted_len =
9291     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
9292
9293   /* Populate the rgroup's len array, if this is the first time we've
9294      used it.  */
9295   if (rgl->controls.is_empty ())
9296     {
9297       rgl->controls.safe_grow_cleared (nvectors, true);
9298       for (unsigned int i = 0; i < nvectors; ++i)
9299         {
9300           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9301           gcc_assert (len_type != NULL_TREE);
9302
9303           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9304
9305           /* Provide a dummy definition until the real one is available.  */
9306           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9307           rgl->controls[i] = len;
9308
9309           if (use_bias_adjusted_len)
9310             {
9311               gcc_assert (i == 0);
9312               tree adjusted_len =
9313                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
9314               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
9315               rgl->bias_adjusted_ctrl = adjusted_len;
9316             }
9317         }
9318     }
9319
9320   if (use_bias_adjusted_len)
9321     return rgl->bias_adjusted_ctrl;
9322   else
9323     return rgl->controls[index];
9324 }
9325
9326 /* Scale profiling counters by estimation for LOOP which is vectorized
9327    by factor VF.  */
9328
9329 static void
9330 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9331 {
9332   edge preheader = loop_preheader_edge (loop);
9333   /* Reduce loop iterations by the vectorization factor.  */
9334   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9335   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9336
9337   if (freq_h.nonzero_p ())
9338     {
9339       profile_probability p;
9340
9341       /* Avoid dropping loop body profile counter to 0 because of zero count
9342          in loop's preheader.  */
9343       if (!(freq_e == profile_count::zero ()))
9344         freq_e = freq_e.force_nonzero ();
9345       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9346       scale_loop_frequencies (loop, p);
9347     }
9348
9349   edge exit_e = single_exit (loop);
9350   exit_e->probability = profile_probability::always ()
9351                                  .apply_scale (1, new_est_niter + 1);
9352
9353   edge exit_l = single_pred_edge (loop->latch);
9354   profile_probability prob = exit_l->probability;
9355   exit_l->probability = exit_e->probability.invert ();
9356   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9357     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9358 }
9359
9360 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9361    latch edge values originally defined by it.  */
9362
9363 static void
9364 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9365                                      stmt_vec_info def_stmt_info)
9366 {
9367   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9368   if (!def || TREE_CODE (def) != SSA_NAME)
9369     return;
9370   stmt_vec_info phi_info;
9371   imm_use_iterator iter;
9372   use_operand_p use_p;
9373   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9374     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9375       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9376           && (phi_info = loop_vinfo->lookup_stmt (phi))
9377           && STMT_VINFO_RELEVANT_P (phi_info)
9378           && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9379           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9380           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9381         {
9382           loop_p loop = gimple_bb (phi)->loop_father;
9383           edge e = loop_latch_edge (loop);
9384           if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9385             {
9386               vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9387               vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9388               gcc_assert (phi_defs.length () == latch_defs.length ());
9389               for (unsigned i = 0; i < phi_defs.length (); ++i)
9390                 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9391                              gimple_get_lhs (latch_defs[i]), e,
9392                              gimple_phi_arg_location (phi, e->dest_idx));
9393             }
9394         }
9395 }
9396
9397 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9398    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9399    stmt_vec_info.  */
9400
9401 static bool
9402 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9403                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9404 {
9405   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9406   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9407
9408   if (dump_enabled_p ())
9409     dump_printf_loc (MSG_NOTE, vect_location,
9410                      "------>vectorizing statement: %G", stmt_info->stmt);
9411
9412   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9413     vect_loop_kill_debug_uses (loop, stmt_info);
9414
9415   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9416       && !STMT_VINFO_LIVE_P (stmt_info))
9417     return false;
9418
9419   if (STMT_VINFO_VECTYPE (stmt_info))
9420     {
9421       poly_uint64 nunits
9422         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9423       if (!STMT_SLP_TYPE (stmt_info)
9424           && maybe_ne (nunits, vf)
9425           && dump_enabled_p ())
9426         /* For SLP VF is set according to unrolling factor, and not
9427            to vector size, hence for SLP this print is not valid.  */
9428         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9429     }
9430
9431   /* Pure SLP statements have already been vectorized.  We still need
9432      to apply loop vectorization to hybrid SLP statements.  */
9433   if (PURE_SLP_STMT (stmt_info))
9434     return false;
9435
9436   if (dump_enabled_p ())
9437     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9438
9439   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9440     *seen_store = stmt_info;
9441
9442   return true;
9443 }
9444
9445 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9446    in the hash_map with its corresponding values.  */
9447
9448 static tree
9449 find_in_mapping (tree t, void *context)
9450 {
9451   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9452
9453   tree *value = mapping->get (t);
9454   return value ? *value : t;
9455 }
9456
9457 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9458    original loop that has now been vectorized.
9459
9460    The inits of the data_references need to be advanced with the number of
9461    iterations of the main loop.  This has been computed in vect_do_peeling and
9462    is stored in parameter ADVANCE.  We first restore the data_references
9463    initial offset with the values recored in ORIG_DRS_INIT.
9464
9465    Since the loop_vec_info of this EPILOGUE was constructed for the original
9466    loop, its stmt_vec_infos all point to the original statements.  These need
9467    to be updated to point to their corresponding copies as well as the SSA_NAMES
9468    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9469
9470    The data_reference's connections also need to be updated.  Their
9471    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9472    stmt_vec_infos, their statements need to point to their corresponding copy,
9473    if they are gather loads or scatter stores then their reference needs to be
9474    updated to point to its corresponding copy and finally we set
9475    'base_misaligned' to false as we have already peeled for alignment in the
9476    prologue of the main loop.  */
9477
9478 static void
9479 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9480 {
9481   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9482   auto_vec<gimple *> stmt_worklist;
9483   hash_map<tree,tree> mapping;
9484   gimple *orig_stmt, *new_stmt;
9485   gimple_stmt_iterator epilogue_gsi;
9486   gphi_iterator epilogue_phi_gsi;
9487   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9488   basic_block *epilogue_bbs = get_loop_body (epilogue);
9489   unsigned i;
9490
9491   free (LOOP_VINFO_BBS (epilogue_vinfo));
9492   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9493
9494   /* Advance data_reference's with the number of iterations of the previous
9495      loop and its prologue.  */
9496   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9497
9498
9499   /* The EPILOGUE loop is a copy of the original loop so they share the same
9500      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9501      point to the copied statements.  We also create a mapping of all LHS' in
9502      the original loop and all the LHS' in the EPILOGUE and create worklists to
9503      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9504   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9505     {
9506       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9507            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9508         {
9509           new_stmt = epilogue_phi_gsi.phi ();
9510
9511           gcc_assert (gimple_uid (new_stmt) > 0);
9512           stmt_vinfo
9513             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9514
9515           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9516           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9517
9518           mapping.put (gimple_phi_result (orig_stmt),
9519                        gimple_phi_result (new_stmt));
9520           /* PHI nodes can not have patterns or related statements.  */
9521           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9522                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9523         }
9524
9525       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9526            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9527         {
9528           new_stmt = gsi_stmt (epilogue_gsi);
9529           if (is_gimple_debug (new_stmt))
9530             continue;
9531
9532           gcc_assert (gimple_uid (new_stmt) > 0);
9533           stmt_vinfo
9534             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9535
9536           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9537           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9538
9539           if (tree old_lhs = gimple_get_lhs (orig_stmt))
9540             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9541
9542           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9543             {
9544               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9545               for (gimple_stmt_iterator gsi = gsi_start (seq);
9546                    !gsi_end_p (gsi); gsi_next (&gsi))
9547                 stmt_worklist.safe_push (gsi_stmt (gsi));
9548             }
9549
9550           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9551           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9552             {
9553               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9554               stmt_worklist.safe_push (stmt);
9555               /* Set BB such that the assert in
9556                 'get_initial_def_for_reduction' is able to determine that
9557                 the BB of the related stmt is inside this loop.  */
9558               gimple_set_bb (stmt,
9559                              gimple_bb (new_stmt));
9560               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9561               gcc_assert (related_vinfo == NULL
9562                           || related_vinfo == stmt_vinfo);
9563             }
9564         }
9565     }
9566
9567   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9568      using the original main loop and thus need to be updated to refer to the
9569      cloned variables used in the epilogue.  */
9570   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9571     {
9572       gimple *stmt = stmt_worklist[i];
9573       tree *new_op;
9574
9575       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9576         {
9577           tree op = gimple_op (stmt, j);
9578           if ((new_op = mapping.get(op)))
9579             gimple_set_op (stmt, j, *new_op);
9580           else
9581             {
9582               /* PR92429: The last argument of simplify_replace_tree disables
9583                  folding when replacing arguments.  This is required as
9584                  otherwise you might end up with different statements than the
9585                  ones analyzed in vect_loop_analyze, leading to different
9586                  vectorization.  */
9587               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9588                                           &find_in_mapping, &mapping, false);
9589               gimple_set_op (stmt, j, op);
9590             }
9591         }
9592     }
9593
9594   struct data_reference *dr;
9595   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9596   FOR_EACH_VEC_ELT (datarefs, i, dr)
9597     {
9598       orig_stmt = DR_STMT (dr);
9599       gcc_assert (gimple_uid (orig_stmt) > 0);
9600       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9601       /* Data references for gather loads and scatter stores do not use the
9602          updated offset we set using ADVANCE.  Instead we have to make sure the
9603          reference in the data references point to the corresponding copy of
9604          the original in the epilogue.  */
9605       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9606           == VMAT_GATHER_SCATTER)
9607         {
9608           DR_REF (dr)
9609             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9610                                      &find_in_mapping, &mapping);
9611           DR_BASE_ADDRESS (dr)
9612             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9613                                      &find_in_mapping, &mapping);
9614         }
9615       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9616       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9617       /* The vector size of the epilogue is smaller than that of the main loop
9618          so the alignment is either the same or lower. This means the dr will
9619          thus by definition be aligned.  */
9620       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9621     }
9622
9623   epilogue_vinfo->shared->datarefs_copy.release ();
9624   epilogue_vinfo->shared->save_datarefs ();
9625 }
9626
9627 /* Function vect_transform_loop.
9628
9629    The analysis phase has determined that the loop is vectorizable.
9630    Vectorize the loop - created vectorized stmts to replace the scalar
9631    stmts in the loop, and update the loop exit condition.
9632    Returns scalar epilogue loop if any.  */
9633
9634 class loop *
9635 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9636 {
9637   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9638   class loop *epilogue = NULL;
9639   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9640   int nbbs = loop->num_nodes;
9641   int i;
9642   tree niters_vector = NULL_TREE;
9643   tree step_vector = NULL_TREE;
9644   tree niters_vector_mult_vf = NULL_TREE;
9645   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9646   unsigned int lowest_vf = constant_lower_bound (vf);
9647   gimple *stmt;
9648   bool check_profitability = false;
9649   unsigned int th;
9650
9651   DUMP_VECT_SCOPE ("vec_transform_loop");
9652
9653   loop_vinfo->shared->check_datarefs ();
9654
9655   /* Use the more conservative vectorization threshold.  If the number
9656      of iterations is constant assume the cost check has been performed
9657      by our caller.  If the threshold makes all loops profitable that
9658      run at least the (estimated) vectorization factor number of times
9659      checking is pointless, too.  */
9660   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9661   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9662     {
9663       if (dump_enabled_p ())
9664         dump_printf_loc (MSG_NOTE, vect_location,
9665                          "Profitability threshold is %d loop iterations.\n",
9666                          th);
9667       check_profitability = true;
9668     }
9669
9670   /* Make sure there exists a single-predecessor exit bb.  Do this before
9671      versioning.   */
9672   edge e = single_exit (loop);
9673   if (! single_pred_p (e->dest))
9674     {
9675       split_loop_exit_edge (e, true);
9676       if (dump_enabled_p ())
9677         dump_printf (MSG_NOTE, "split exit edge\n");
9678     }
9679
9680   /* Version the loop first, if required, so the profitability check
9681      comes first.  */
9682
9683   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9684     {
9685       class loop *sloop
9686         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9687       sloop->force_vectorize = false;
9688       check_profitability = false;
9689     }
9690
9691   /* Make sure there exists a single-predecessor exit bb also on the
9692      scalar loop copy.  Do this after versioning but before peeling
9693      so CFG structure is fine for both scalar and if-converted loop
9694      to make slpeel_duplicate_current_defs_from_edges face matched
9695      loop closed PHI nodes on the exit.  */
9696   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9697     {
9698       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9699       if (! single_pred_p (e->dest))
9700         {
9701           split_loop_exit_edge (e, true);
9702           if (dump_enabled_p ())
9703             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9704         }
9705     }
9706
9707   tree niters = vect_build_loop_niters (loop_vinfo);
9708   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9709   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9710   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9711   tree advance;
9712   drs_init_vec orig_drs_init;
9713
9714   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9715                               &step_vector, &niters_vector_mult_vf, th,
9716                               check_profitability, niters_no_overflow,
9717                               &advance);
9718
9719   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9720       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9721     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9722                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9723
9724   if (niters_vector == NULL_TREE)
9725     {
9726       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9727           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9728           && known_eq (lowest_vf, vf))
9729         {
9730           niters_vector
9731             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9732                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9733           step_vector = build_one_cst (TREE_TYPE (niters));
9734         }
9735       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9736         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9737                                      &step_vector, niters_no_overflow);
9738       else
9739         /* vect_do_peeling subtracted the number of peeled prologue
9740            iterations from LOOP_VINFO_NITERS.  */
9741         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9742                                      &niters_vector, &step_vector,
9743                                      niters_no_overflow);
9744     }
9745
9746   /* 1) Make sure the loop header has exactly two entries
9747      2) Make sure we have a preheader basic block.  */
9748
9749   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9750
9751   split_edge (loop_preheader_edge (loop));
9752
9753   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9754     /* This will deal with any possible peeling.  */
9755     vect_prepare_for_masked_peels (loop_vinfo);
9756
9757   /* Schedule the SLP instances first, then handle loop vectorization
9758      below.  */
9759   if (!loop_vinfo->slp_instances.is_empty ())
9760     {
9761       DUMP_VECT_SCOPE ("scheduling SLP instances");
9762       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9763     }
9764
9765   /* FORNOW: the vectorizer supports only loops which body consist
9766      of one basic block (header + empty latch). When the vectorizer will
9767      support more involved loop forms, the order by which the BBs are
9768      traversed need to be reconsidered.  */
9769
9770   for (i = 0; i < nbbs; i++)
9771     {
9772       basic_block bb = bbs[i];
9773       stmt_vec_info stmt_info;
9774
9775       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9776            gsi_next (&si))
9777         {
9778           gphi *phi = si.phi ();
9779           if (dump_enabled_p ())
9780             dump_printf_loc (MSG_NOTE, vect_location,
9781                              "------>vectorizing phi: %G", phi);
9782           stmt_info = loop_vinfo->lookup_stmt (phi);
9783           if (!stmt_info)
9784             continue;
9785
9786           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9787             vect_loop_kill_debug_uses (loop, stmt_info);
9788
9789           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9790               && !STMT_VINFO_LIVE_P (stmt_info))
9791             continue;
9792
9793           if (STMT_VINFO_VECTYPE (stmt_info)
9794               && (maybe_ne
9795                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9796               && dump_enabled_p ())
9797             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9798
9799           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9800                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9801                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9802                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9803                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9804               && ! PURE_SLP_STMT (stmt_info))
9805             {
9806               if (dump_enabled_p ())
9807                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9808               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9809             }
9810         }
9811
9812       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9813            gsi_next (&si))
9814         {
9815           gphi *phi = si.phi ();
9816           stmt_info = loop_vinfo->lookup_stmt (phi);
9817           if (!stmt_info)
9818             continue;
9819
9820           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9821               && !STMT_VINFO_LIVE_P (stmt_info))
9822             continue;
9823
9824           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9825                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9826                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9827                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9828                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9829               && ! PURE_SLP_STMT (stmt_info))
9830             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9831         }
9832
9833       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9834            !gsi_end_p (si);)
9835         {
9836           stmt = gsi_stmt (si);
9837           /* During vectorization remove existing clobber stmts.  */
9838           if (gimple_clobber_p (stmt))
9839             {
9840               unlink_stmt_vdef (stmt);
9841               gsi_remove (&si, true);
9842               release_defs (stmt);
9843             }
9844           else
9845             {
9846               /* Ignore vector stmts created in the outer loop.  */
9847               stmt_info = loop_vinfo->lookup_stmt (stmt);
9848
9849               /* vector stmts created in the outer-loop during vectorization of
9850                  stmts in an inner-loop may not have a stmt_info, and do not
9851                  need to be vectorized.  */
9852               stmt_vec_info seen_store = NULL;
9853               if (stmt_info)
9854                 {
9855                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9856                     {
9857                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9858                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9859                            !gsi_end_p (subsi); gsi_next (&subsi))
9860                         {
9861                           stmt_vec_info pat_stmt_info
9862                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9863                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9864                                                     &si, &seen_store);
9865                         }
9866                       stmt_vec_info pat_stmt_info
9867                         = STMT_VINFO_RELATED_STMT (stmt_info);
9868                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9869                                                     &si, &seen_store))
9870                         maybe_set_vectorized_backedge_value (loop_vinfo,
9871                                                              pat_stmt_info);
9872                     }
9873                   else
9874                     {
9875                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9876                                                     &seen_store))
9877                         maybe_set_vectorized_backedge_value (loop_vinfo,
9878                                                              stmt_info);
9879                     }
9880                 }
9881               gsi_next (&si);
9882               if (seen_store)
9883                 {
9884                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9885                     /* Interleaving.  If IS_STORE is TRUE, the
9886                        vectorization of the interleaving chain was
9887                        completed - free all the stores in the chain.  */
9888                     vect_remove_stores (loop_vinfo,
9889                                         DR_GROUP_FIRST_ELEMENT (seen_store));
9890                   else
9891                     /* Free the attached stmt_vec_info and remove the stmt.  */
9892                     loop_vinfo->remove_stmt (stmt_info);
9893                 }
9894             }
9895         }
9896
9897       /* Stub out scalar statements that must not survive vectorization.
9898          Doing this here helps with grouped statements, or statements that
9899          are involved in patterns.  */
9900       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9901            !gsi_end_p (gsi); gsi_next (&gsi))
9902         {
9903           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9904           if (!call || !gimple_call_internal_p (call))
9905             continue;
9906           internal_fn ifn = gimple_call_internal_fn (call);
9907           if (ifn == IFN_MASK_LOAD)
9908             {
9909               tree lhs = gimple_get_lhs (call);
9910               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9911                 {
9912                   tree zero = build_zero_cst (TREE_TYPE (lhs));
9913                   gimple *new_stmt = gimple_build_assign (lhs, zero);
9914                   gsi_replace (&gsi, new_stmt, true);
9915                 }
9916             }
9917           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9918             {
9919               tree lhs = gimple_get_lhs (call);
9920               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9921                 {
9922                   tree else_arg
9923                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9924                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9925                   gsi_replace (&gsi, new_stmt, true);
9926                 }
9927             }
9928         }
9929     }                           /* BBs in loop */
9930
9931   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9932      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9933   if (integer_onep (step_vector))
9934     niters_no_overflow = true;
9935   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9936                            niters_vector_mult_vf, !niters_no_overflow);
9937
9938   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9939   scale_profile_for_vect_loop (loop, assumed_vf);
9940
9941   /* True if the final iteration might not handle a full vector's
9942      worth of scalar iterations.  */
9943   bool final_iter_may_be_partial
9944     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9945   /* The minimum number of iterations performed by the epilogue.  This
9946      is 1 when peeling for gaps because we always need a final scalar
9947      iteration.  */
9948   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9949   /* +1 to convert latch counts to loop iteration counts,
9950      -min_epilogue_iters to remove iterations that cannot be performed
9951        by the vector code.  */
9952   int bias_for_lowest = 1 - min_epilogue_iters;
9953   int bias_for_assumed = bias_for_lowest;
9954   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9955   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9956     {
9957       /* When the amount of peeling is known at compile time, the first
9958          iteration will have exactly alignment_npeels active elements.
9959          In the worst case it will have at least one.  */
9960       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9961       bias_for_lowest += lowest_vf - min_first_active;
9962       bias_for_assumed += assumed_vf - min_first_active;
9963     }
9964   /* In these calculations the "- 1" converts loop iteration counts
9965      back to latch counts.  */
9966   if (loop->any_upper_bound)
9967     {
9968       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9969       loop->nb_iterations_upper_bound
9970         = (final_iter_may_be_partial
9971            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9972                             lowest_vf) - 1
9973            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9974                              lowest_vf) - 1);
9975       if (main_vinfo
9976           /* Both peeling for alignment and peeling for gaps can end up
9977              with the scalar epilogue running for more than VF-1 iterations.  */
9978           && !main_vinfo->peeling_for_alignment
9979           && !main_vinfo->peeling_for_gaps)
9980         {
9981           unsigned int bound;
9982           poly_uint64 main_iters
9983             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9984                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9985           main_iters
9986             = upper_bound (main_iters,
9987                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9988           if (can_div_away_from_zero_p (main_iters,
9989                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9990                                         &bound))
9991             loop->nb_iterations_upper_bound
9992               = wi::umin ((widest_int) (bound - 1),
9993                           loop->nb_iterations_upper_bound);
9994       }
9995   }
9996   if (loop->any_likely_upper_bound)
9997     loop->nb_iterations_likely_upper_bound
9998       = (final_iter_may_be_partial
9999          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10000                           + bias_for_lowest, lowest_vf) - 1
10001          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10002                            + bias_for_lowest, lowest_vf) - 1);
10003   if (loop->any_estimate)
10004     loop->nb_iterations_estimate
10005       = (final_iter_may_be_partial
10006          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10007                           assumed_vf) - 1
10008          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10009                            assumed_vf) - 1);
10010
10011   if (dump_enabled_p ())
10012     {
10013       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10014         {
10015           dump_printf_loc (MSG_NOTE, vect_location,
10016                            "LOOP VECTORIZED\n");
10017           if (loop->inner)
10018             dump_printf_loc (MSG_NOTE, vect_location,
10019                              "OUTER LOOP VECTORIZED\n");
10020           dump_printf (MSG_NOTE, "\n");
10021         }
10022       else
10023         dump_printf_loc (MSG_NOTE, vect_location,
10024                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10025                          GET_MODE_NAME (loop_vinfo->vector_mode));
10026     }
10027
10028   /* Loops vectorized with a variable factor won't benefit from
10029      unrolling/peeling.  */
10030   if (!vf.is_constant ())
10031     {
10032       loop->unroll = 1;
10033       if (dump_enabled_p ())
10034         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10035                          " variable-length vectorization factor\n");
10036     }
10037   /* Free SLP instances here because otherwise stmt reference counting
10038      won't work.  */
10039   slp_instance instance;
10040   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10041     vect_free_slp_instance (instance);
10042   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10043   /* Clear-up safelen field since its value is invalid after vectorization
10044      since vectorized loop can have loop-carried dependencies.  */
10045   loop->safelen = 0;
10046
10047   if (epilogue)
10048     {
10049       update_epilogue_loop_vinfo (epilogue, advance);
10050
10051       epilogue->simduid = loop->simduid;
10052       epilogue->force_vectorize = loop->force_vectorize;
10053       epilogue->dont_vectorize = false;
10054     }
10055
10056   return epilogue;
10057 }
10058
10059 /* The code below is trying to perform simple optimization - revert
10060    if-conversion for masked stores, i.e. if the mask of a store is zero
10061    do not perform it and all stored value producers also if possible.
10062    For example,
10063      for (i=0; i<n; i++)
10064        if (c[i])
10065         {
10066           p1[i] += 1;
10067           p2[i] = p3[i] +2;
10068         }
10069    this transformation will produce the following semi-hammock:
10070
10071    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10072      {
10073        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10074        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10075        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10076        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10077        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10078        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10079      }
10080 */
10081
10082 void
10083 optimize_mask_stores (class loop *loop)
10084 {
10085   basic_block *bbs = get_loop_body (loop);
10086   unsigned nbbs = loop->num_nodes;
10087   unsigned i;
10088   basic_block bb;
10089   class loop *bb_loop;
10090   gimple_stmt_iterator gsi;
10091   gimple *stmt;
10092   auto_vec<gimple *> worklist;
10093   auto_purge_vect_location sentinel;
10094
10095   vect_location = find_loop_location (loop);
10096   /* Pick up all masked stores in loop if any.  */
10097   for (i = 0; i < nbbs; i++)
10098     {
10099       bb = bbs[i];
10100       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10101            gsi_next (&gsi))
10102         {
10103           stmt = gsi_stmt (gsi);
10104           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10105             worklist.safe_push (stmt);
10106         }
10107     }
10108
10109   free (bbs);
10110   if (worklist.is_empty ())
10111     return;
10112
10113   /* Loop has masked stores.  */
10114   while (!worklist.is_empty ())
10115     {
10116       gimple *last, *last_store;
10117       edge e, efalse;
10118       tree mask;
10119       basic_block store_bb, join_bb;
10120       gimple_stmt_iterator gsi_to;
10121       tree vdef, new_vdef;
10122       gphi *phi;
10123       tree vectype;
10124       tree zero;
10125
10126       last = worklist.pop ();
10127       mask = gimple_call_arg (last, 2);
10128       bb = gimple_bb (last);
10129       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10130          the same loop as if_bb.  It could be different to LOOP when two
10131          level loop-nest is vectorized and mask_store belongs to the inner
10132          one.  */
10133       e = split_block (bb, last);
10134       bb_loop = bb->loop_father;
10135       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10136       join_bb = e->dest;
10137       store_bb = create_empty_bb (bb);
10138       add_bb_to_loop (store_bb, bb_loop);
10139       e->flags = EDGE_TRUE_VALUE;
10140       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10141       /* Put STORE_BB to likely part.  */
10142       efalse->probability = profile_probability::unlikely ();
10143       store_bb->count = efalse->count ();
10144       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10145       if (dom_info_available_p (CDI_DOMINATORS))
10146         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10147       if (dump_enabled_p ())
10148         dump_printf_loc (MSG_NOTE, vect_location,
10149                          "Create new block %d to sink mask stores.",
10150                          store_bb->index);
10151       /* Create vector comparison with boolean result.  */
10152       vectype = TREE_TYPE (mask);
10153       zero = build_zero_cst (vectype);
10154       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10155       gsi = gsi_last_bb (bb);
10156       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10157       /* Create new PHI node for vdef of the last masked store:
10158          .MEM_2 = VDEF <.MEM_1>
10159          will be converted to
10160          .MEM.3 = VDEF <.MEM_1>
10161          and new PHI node will be created in join bb
10162          .MEM_2 = PHI <.MEM_1, .MEM_3>
10163       */
10164       vdef = gimple_vdef (last);
10165       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10166       gimple_set_vdef (last, new_vdef);
10167       phi = create_phi_node (vdef, join_bb);
10168       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10169
10170       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10171       while (true)
10172         {
10173           gimple_stmt_iterator gsi_from;
10174           gimple *stmt1 = NULL;
10175
10176           /* Move masked store to STORE_BB.  */
10177           last_store = last;
10178           gsi = gsi_for_stmt (last);
10179           gsi_from = gsi;
10180           /* Shift GSI to the previous stmt for further traversal.  */
10181           gsi_prev (&gsi);
10182           gsi_to = gsi_start_bb (store_bb);
10183           gsi_move_before (&gsi_from, &gsi_to);
10184           /* Setup GSI_TO to the non-empty block start.  */
10185           gsi_to = gsi_start_bb (store_bb);
10186           if (dump_enabled_p ())
10187             dump_printf_loc (MSG_NOTE, vect_location,
10188                              "Move stmt to created bb\n%G", last);
10189           /* Move all stored value producers if possible.  */
10190           while (!gsi_end_p (gsi))
10191             {
10192               tree lhs;
10193               imm_use_iterator imm_iter;
10194               use_operand_p use_p;
10195               bool res;
10196
10197               /* Skip debug statements.  */
10198               if (is_gimple_debug (gsi_stmt (gsi)))
10199                 {
10200                   gsi_prev (&gsi);
10201                   continue;
10202                 }
10203               stmt1 = gsi_stmt (gsi);
10204               /* Do not consider statements writing to memory or having
10205                  volatile operand.  */
10206               if (gimple_vdef (stmt1)
10207                   || gimple_has_volatile_ops (stmt1))
10208                 break;
10209               gsi_from = gsi;
10210               gsi_prev (&gsi);
10211               lhs = gimple_get_lhs (stmt1);
10212               if (!lhs)
10213                 break;
10214
10215               /* LHS of vectorized stmt must be SSA_NAME.  */
10216               if (TREE_CODE (lhs) != SSA_NAME)
10217                 break;
10218
10219               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10220                 {
10221                   /* Remove dead scalar statement.  */
10222                   if (has_zero_uses (lhs))
10223                     {
10224                       gsi_remove (&gsi_from, true);
10225                       continue;
10226                     }
10227                 }
10228
10229               /* Check that LHS does not have uses outside of STORE_BB.  */
10230               res = true;
10231               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10232                 {
10233                   gimple *use_stmt;
10234                   use_stmt = USE_STMT (use_p);
10235                   if (is_gimple_debug (use_stmt))
10236                     continue;
10237                   if (gimple_bb (use_stmt) != store_bb)
10238                     {
10239                       res = false;
10240                       break;
10241                     }
10242                 }
10243               if (!res)
10244                 break;
10245
10246               if (gimple_vuse (stmt1)
10247                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
10248                 break;
10249
10250               /* Can move STMT1 to STORE_BB.  */
10251               if (dump_enabled_p ())
10252                 dump_printf_loc (MSG_NOTE, vect_location,
10253                                  "Move stmt to created bb\n%G", stmt1);
10254               gsi_move_before (&gsi_from, &gsi_to);
10255               /* Shift GSI_TO for further insertion.  */
10256               gsi_prev (&gsi_to);
10257             }
10258           /* Put other masked stores with the same mask to STORE_BB.  */
10259           if (worklist.is_empty ()
10260               || gimple_call_arg (worklist.last (), 2) != mask
10261               || worklist.last () != stmt1)
10262             break;
10263           last = worklist.pop ();
10264         }
10265       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10266     }
10267 }
10268
10269 /* Decide whether it is possible to use a zero-based induction variable
10270    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10271    the value that the induction variable must be able to hold in order
10272    to ensure that the rgroups eventually have no active vector elements.
10273    Return -1 otherwise.  */
10274
10275 widest_int
10276 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10277 {
10278   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10279   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10280   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10281
10282   /* Calculate the value that the induction variable must be able
10283      to hit in order to ensure that we end the loop with an all-false mask.
10284      This involves adding the maximum number of inactive trailing scalar
10285      iterations.  */
10286   widest_int iv_limit = -1;
10287   if (max_loop_iterations (loop, &iv_limit))
10288     {
10289       if (niters_skip)
10290         {
10291           /* Add the maximum number of skipped iterations to the
10292              maximum iteration count.  */
10293           if (TREE_CODE (niters_skip) == INTEGER_CST)
10294             iv_limit += wi::to_widest (niters_skip);
10295           else
10296             iv_limit += max_vf - 1;
10297         }
10298       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10299         /* Make a conservatively-correct assumption.  */
10300         iv_limit += max_vf - 1;
10301
10302       /* IV_LIMIT is the maximum number of latch iterations, which is also
10303          the maximum in-range IV value.  Round this value down to the previous
10304          vector alignment boundary and then add an extra full iteration.  */
10305       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10306       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10307     }
10308   return iv_limit;
10309 }
10310
10311 /* For the given rgroup_controls RGC, check whether an induction variable
10312    would ever hit a value that produces a set of all-false masks or zero
10313    lengths before wrapping around.  Return true if it's possible to wrap
10314    around before hitting the desirable value, otherwise return false.  */
10315
10316 bool
10317 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10318 {
10319   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10320
10321   if (iv_limit == -1)
10322     return true;
10323
10324   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10325   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10326   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10327
10328   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10329     return true;
10330
10331   return false;
10332 }