/* Loop Vectorization
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define INCLUDE_ALGORITHM
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "case-cfn-macros.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it had been manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}
   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
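   For instance (an illustrative example only, not an exhaustive list),
   the access a[i] in

	for (i=0; i<N; i++)
	  sum += a[i];

   is a consecutive (stride-1) data-ref and matches this model, whereas an
   access such as a[2*i] advances by more than one element per iteration
   and does not fit the simple consecutive pattern described here.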
   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.
   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.
   For example, say stmt S1 was vectorized into stmt VS1:

	VS1: vb = px[i];
	S1:  b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
	S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

	VS1: vb = px[i];
	S1:  b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
	VS2: va = vb;
	S2:  a = b;	STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
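   As a minimal illustration (a sketch, not the pass's actual control flow),
   such a support check amounts to:

	if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
	  /* No V8HImode vector add on this target; give up on this stmt.  */
	  return false;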
   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
						 unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *);
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static opt_result
vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
						    &stmt_vectype,
						    &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (vec_info *vinfo,
			    stmt_vec_info stmt_info, poly_uint64 *vf)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/
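/* Illustrative note (a sketch, not this function's job): when N is not a
   multiple of VF, the remaining N % VF iterations cannot form a full vector
   iteration.  Conceptually they are handled by a scalar epilogue:

	for (i = 0; i + VF <= N; i += VF)
	  a[i:VF] = b[i:VF] + c[i:VF];
	for (; i < N; i++)
	  a[i] = b[i] + c[i];

   The pass may instead use peeling or partially-populated vectors for this
   remainder; see the PEELING_FOR_NITER and partial-vector handling later in
   this file.  */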
static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     (gimple *) phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (loop_vinfo,
					  stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
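/* For example (illustrative only): for the IV of "for (i = 0; i < n; i++)"
   scev reports the access function {0, +, 1}_1 - initial value 0, step 1 -
   which is a simple evolution.  A chrec whose step is itself a chrec, such
   as {0, +, {1, +, 1}_1}_1, has degree >= 2 and is rejected below.  */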
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree *init,
			     tree *step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
		     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
	x_1 = PHI <x_4(outer2), ...>;
	...

      inner:
	x_2 = PHI <x_1(outer1), ...>;
	...
	x_3 = ...;

      outer2:
	x_4 = PHI <x_3(inner)>;
	...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */
static bool
vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
{
  use_operand_p use_p;
  ssa_op_iter op_iter;
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
	return true;
  return false;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */
static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      if (!access_fn
	  || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      && TREE_CODE (step) != INTEGER_CST))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain);
      if (reduc_stmt_info)
	{
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info)
		= vect_double_reduction_def;
	    }
	  else if (loop != LOOP_VINFO_LOOP (loop_vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected vectorizable nested cycle.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
	      /* Store the reduction cycles for possible vectorization in
		 loop-aware SLP if it was not detected as reduction
		 chain.  */
	      if (! reduc_chain)
		LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		  (reduc_stmt_info);
	    }
	}
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also to its
   inner-loop, if it exists.

   Examples of scalar cycles:

   Example1: reduction:

	      loop1:
	      for (i=0; i<N; i++)
		 sum += a[i];

   Example2: induction:

	      loop2:
	      for (i=0; i<N; i++)
		 a[i] = i;  */
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
	scalar loop, so we can't change the order of computation when
	vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
	current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
			   == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
	REDUC_GROUP_NEXT_ELEMENT (stmtp)
	  = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}
/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    {
      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
      while (next)
	{
	  if ((STMT_VINFO_IN_PATTERN_P (next)
	       != STMT_VINFO_IN_PATTERN_P (first))
	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
	    break;
	  next = REDUC_GROUP_NEXT_ELEMENT (next);
	}
      /* If all reduction chain members are well-formed patterns adjust
	 the group to group the pattern stmts instead.  */
      if (! next
	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
	{
	  if (STMT_VINFO_IN_PATTERN_P (first))
	    {
	      vect_fixup_reduc_chain (first);
	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
		= STMT_VINFO_RELATED_STMT (first);
	    }
	}
      /* If not all stmts in the chain are patterns or if we failed
	 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
	 it as regular reduction instead.  */
      else
	{
	  stmt_vec_info vinfo = first;
	  stmt_vec_info last = NULL;
	  while (vinfo)
	    {
	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
	      last = vinfo;
	      vinfo = next;
	    }
	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
	    = vect_internal_def;
	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
	  --i;
	}
    }
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */
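/* Illustrative example (not part of the interface): for a loop of the form
   "for (i = 0; i < n; i++)" with n >= 1, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 while NUMBER_OF_ITERATIONS (the number of
   loop header executions) is n.  */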
static gcond *
vect_get_loop_niters (class loop *loop, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  DUMP_VECT_SCOPE ("get_loop_niters");

  if (!exit)
    return cond;

  may_be_zero = NULL_TREE;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
	{
	  /* Try to combine may_be_zero with assumptions, this can simplify
	     computation of niter expression.  */
	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
	    niter_assumptions
	      = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			     niter_assumptions,
			     fold_build1 (TRUTH_NOT_EXPR, boolean_type_node,
					  may_be_zero));
	  else
	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				 build_int_cst (TREE_TYPE (niter), 0),
				 rewrite_to_non_trapping_overflow (niter));

	  may_be_zero = NULL_TREE;
	}
      else if (integer_nonzerop (may_be_zero))
	{
	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	  return cond;
	}
      else
	return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ???  For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
			 build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const class loop *const loop = (const class loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    peeling_for_alignment (0),
    slp_unrolling_factor (1),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    epil_using_partial_vectors_p (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    orig_loop_info (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }

  epilogue_vinfos.create (6);
}
/* Free all levels of rgroup CONTROLS.  */

void
release_vec_loop_controls (vec<rgroup_controls> *controls)
{
  rgroup_controls *rgc;
  unsigned int i;
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    rgc->controls.release ();
  controls->release ();
}
/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  free (bbs);

  release_vec_loop_controls (&masks);
  release_vec_loop_controls (&lens);
  delete ivexpr_map;
  delete scan_map;
  epilogue_vinfos.release ();
  delete scalar_costs;
  delete vector_costs;

  /* When we release an epilogue vinfo that we do not intend to use
     avoid clearing AUX of the main loop which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
  if (loop->aux == this)
    loop->aux = NULL;
}
/* Return an invariant or register for EXPR and emit necessary
   computations in the LOOP_VINFO loop preheader.  */

tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
{
  if (is_gimple_reg (expr)
      || is_gimple_min_invariant (expr))
    return expr;

  if (! loop_vinfo->ivexpr_map)
    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
  if (! cached)
    {
      gimple_seq stmts = NULL;
      cached = force_gimple_operand (unshare_expr (expr),
				     &stmts, true, NULL_TREE);
      if (stmts)
	{
	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
	  gsi_insert_seq_on_edge_immediate (e, stmts);
	}
    }
  return cached;
}
/* Return true if we can use CMP_TYPE as the comparison type to produce
   all masks required to mask LOOP_VINFO.  */

static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
  rgroup_controls *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    if (rgm->type != NULL_TREE
	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
					    cmp_type, rgm->type,
					    OPTIMIZE_FOR_SPEED))
      return false;
  return true;
}
/* Calculate the maximum number of scalars per iteration for every
   rgroup in LOOP_VINFO.  */

static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
  unsigned int res = 1;
  unsigned int i;
  rgroup_controls *rgm;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    res = MAX (res, rgm->max_nscalars_per_iter);
  return res;
}
/* Calculate the minimum precision necessary to represent:

      MAX_NITERS * FACTOR

   as an unsigned integer, where MAX_NITERS is the maximum number of
   loop header iterations for the original scalar form of LOOP_VINFO.  */

static unsigned
vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Work out how many bits we need to represent the limit.  */
  return wi::min_precision (max_ni * factor, UNSIGNED);
}
/* True if the loop needs peeling or partial vectors when vectorized.  */

static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
					  (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	peel_niter += 1;
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   /* ??? When peeling for gaps but not alignment, we could
	      try to check whether the (variable) niters is known to be
	      VF * N + 1.  That's something of a niche case though.  */
	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       > (th / const_vf) * const_vf))))
    return true;

  return false;
}
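/* Quick illustration (made-up numbers): with a known iteration count of 10,
   no peeling for alignment or gaps, and VF == 4, 10 is not a multiple of 4,
   so the function above returns true - the two leftover iterations need
   either an epilogue (peeling for niters) or partial vectors.  */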
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;
  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  if (!cmp_type)
    return false;

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  return true;
}
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */

static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  machine_mode len_load_mode = get_len_load_store_mode
    (loop_vinfo->vector_mode, true).require ();
  machine_mode len_store_mode = get_len_load_store_mode
    (loop_vinfo->vector_mode, false).require ();

  signed char partial_load_bias = internal_len_load_store_bias
    (IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (IFN_LEN_STORE, len_store_mode);

  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;

  return true;
}
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
  add_stmt_costs (loop_vinfo->scalar_costs,
		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (nullptr);
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.  */
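/* Illustrative example (a sketch only) of a loop in the form this analysis
   accepts - a countable do-while with all executable statements in the
   header and an empty latch:

	i = 0;
	do
	  {
	    a[i] = b[i] + c[i];
	    i = i + 1;
	  }
	while (i < n);

   Such a loop has a single exit, two incoming header edges (preheader and
   latch), and a niter expression the analysis below can compute.  */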
opt_result
vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future).  */

  info->inner_loop_cond = NULL;
  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
	 exactly 2 (the header and latch).  Vectorizable inner-most loops
	 look like this:

			(pre-header)
			   |
			  header <--------+
			   | |            |
			   | +--> latch --+
			   |
			(exit-bb)  */

      if (loop->num_nodes != 2)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      if (empty_block_p (loop->header))
	return opt_result::failure_at (vect_location,
				       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop, and the number of BBs is exactly 5.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop has the properties expected of inner-most loops
	 as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " multiple nested loops.\n");

      if (loop->num_nodes != 5)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
	  || !single_exit (innerloop)
	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      vect_loop_form_info inner;
      opt_result res = vect_analyze_loop_form (loop->inner, &inner);
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: Bad inner loop.\n");
	  return res;
	}

      /* Don't support analyzing niter under assumptions for inner
	 loop.  */
      if (!integer_onep (inner.assumptions))
	return opt_result::failure_at (vect_location,
				       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
	return opt_result::failure_at (vect_location,
				       "not vectorized: inner-loop count not"
				       " invariant.\n");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Considering outer-loop vectorization.\n");
      info->inner_loop_cond = inner.loop_cond;
    }

  if (!single_exit (loop))
    return opt_result::failure_at (vect_location,
				   "not vectorized: multiple exits.\n");
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " too many incoming edges.\n");

  /* We assume that the loop exit condition is at the end of the loop.  i.e,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
				   "not vectorized: latch block not empty.\n");

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " abnormal loop exit edge.\n");

  info->loop_cond
    = vect_get_loop_niters (loop, &info->assumptions,
			    &info->number_of_iterations,
			    &info->number_of_iterationsm1);
  if (!info->loop_cond)
    return opt_result::failure_at
      (vect_location,
       "not vectorized: complicated exit condition.\n");

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (info->loop_cond,
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (info->loop_cond,
       "not vectorized: number of iterations = 0.\n");

  if (!(tree_fits_shwi_p (info->number_of_iterations)
	&& tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Symbolic number of iterations is ");
	  dump_generic_expr (MSG_NOTE, TDF_DETAILS,
			     info->number_of_iterations);
	  dump_printf (MSG_NOTE, "\n");
	}
    }

  return opt_result::success ();
}
/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result.  */

loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
			const vect_loop_form_info *info,
			loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
  STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
	 loop use that to limit the scale for costing, otherwise use
	 --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
	  = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
/* Scan the loop stmts and dependent on whether there are any (non-)SLP
   statements update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say, that we
     perform pure SLP on loop - cross iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
	  if (!stmt_info)
	    continue;
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  stmt_info = vect_stmt_to_vectorize (stmt_info);
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
	 so they must have a common multiple.  */
      vectorization_factor
	= force_common_multiple (vectorization_factor,
				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}
/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
	x_1 = PHI <x_3(outer2), ...>;
	...

      inner:
	x_2 = ...;
	...

      outer2:
	x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  ok = true;

	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
			     (gimple *) phi);
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;

	  /* Inner-loop loop-closed exit phi in outer-loop vectorization
	     (i.e., a phi in the tail of the outer-loop).  */
	  if (! is_loop_header_bb_p (bb))
	    {
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), cause this case
		 requires to actually do something here.  */
	      if (STMT_VINFO_LIVE_P (stmt_info)
		  && !vect_active_double_reduction_p (stmt_info))
		return opt_result::failure_at (phi,
					       "Unsupported loop-closed phi"
					       " in outer-loop.\n");

	      /* If PHI is used in the outer loop, we check that its operand
		 is defined in the inner loop.  */
	      if (STMT_VINFO_RELEVANT_P (stmt_info))
		{
		  tree phi_op;

		  if (gimple_phi_num_args (phi) != 1)
		    return opt_result::failure_at (phi, "unsupported phi");

		  phi_op = PHI_ARG_DEF (phi, 0);
		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
		  if (!op_def_info)
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
		      && (STMT_VINFO_RELEVANT (op_def_info)
			  != vect_used_in_outer_by_reduction))
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
		       || (STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_double_reduction_def))
		      && !vectorizable_lc_phi (loop_vinfo,
					       stmt_info, NULL, NULL))
		    return opt_result::failure_at (phi, "unsupported phi\n");
		}

	      continue;
	    }

	  gcc_assert (stmt_info);

	  if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
	       || STMT_VINFO_LIVE_P (stmt_info))
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
	    /* A scalar-dependence cycle that we don't support.  */
	    return opt_result::failure_at (phi,
					   "not vectorized:"
					   " scalar dependence cycle.\n");

	  if (STMT_VINFO_RELEVANT_P (stmt_info))
	    {
	      need_to_vectorize = true;
	      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
		  && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_induction (loop_vinfo,
					     stmt_info, NULL, NULL,
					     &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
			|| (STMT_VINFO_DEF_TYPE (stmt_info)
			    == vect_double_reduction_def)
			|| STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_nested_cycle)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_reduction (loop_vinfo,
					     stmt_info, NULL, NULL, &cost_vec);
	    }

	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
	  if (ok
	      && STMT_VINFO_LIVE_P (stmt_info)
	      && !PURE_SLP_STMT (stmt_info))
	    ok = vectorizable_live_operation (loop_vinfo,
					      stmt_info, NULL, NULL, NULL,
					      -1, false, &cost_vec);

	  if (!ok)
	    return opt_result::failure_at (phi,
					   "not vectorized: relevant phi not "
					   "supported: %G",
					   static_cast <gimple *> (phi));
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  if (!gimple_clobber_p (stmt)
	      && !is_gimple_debug (stmt))
	    {
	      opt_result res
		= vect_analyze_stmt (loop_vinfo,
				     loop_vinfo->lookup_stmt (stmt),
				     &need_to_vectorize,
				     NULL, NULL, &cost_vec);
	      if (!res)
		return res;
	    }
	}
    } /* bbs */

  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
	(vect_location,
	 "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}
/* Return true if we know that the iteration count is smaller than the
   vectorization factor.  Return false if it isn't, or if we can't be sure
   either way.  */

static bool
vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
{
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  HOST_WIDE_INT max_niter;
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
  else
    max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
    return true;

  return false;
}
/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  */

static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
			   unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      if (vect_known_niters_smaller_than_vf (loop_vinfo))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: iteration count smaller than "
			     "vectorization factor.\n");
	  return 0;
	}
    }

  /* If using the "very cheap" model, reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "some scalar iterations would need to be peeled\n");
      return 0;
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate,
				      suggested_unroll_factor);

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vector version will never be "
			 "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
			       * assumed_vf);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
				    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: iteration count smaller than user "
			 "specified loop bound parameter or minimum profitable "
			 "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
			 " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "one iteration of the vector loop would be"
			 " more expensive than the equivalent number of"
			 " iterations of the scalar loop\n");
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
	estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
	  < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: estimated iteration count too "
			 "small.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: estimated iteration count smaller "
			 "than specified loop bound parameter or minimum "
			 "profitable iterations (whichever is more "
			 "conservative).\n");
      return -1;
    }

  return 1;
}
static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
                           vec<data_reference_p> *datarefs,
                           unsigned int *n_stmts)
{
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gimple *stmt = gsi_stmt (gsi);
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
        opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
                                                        NULL, 0);
        if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
              {
                tree fndecl = gimple_call_fndecl (stmt), op;
                if (fndecl != NULL_TREE)
                  {
                    cgraph_node *node = cgraph_node::get (fndecl);
                    if (node != NULL && node->simd_clones != NULL)
                      {
                        unsigned int j, n = gimple_call_num_args (stmt);
                        for (j = 0; j < n; j++)
                          {
                            op = gimple_call_arg (stmt, j);
                            if (DECL_P (op)
                                || (REFERENCE_CLASS_P (op)
                                    && get_base_address (op)))
                              break;
                          }
                        op = gimple_call_lhs (stmt);
                        /* Ignore #pragma omp declare simd functions
                           if they don't have data references in the
                           call stmt itself.  */
                        if (j == n
                            && !(op
                                 && (DECL_P (op)
                                     || (REFERENCE_CLASS_P (op)
                                         && get_base_address (op)))))
                          continue;
                      }
                  }
              }
            return res;
          }
        /* If dependence analysis will give up due to the limit on the
           number of datarefs stop here and fail fatally.  */
        if (datarefs->length ()
            > (unsigned)param_loop_max_datarefs_for_datadeps)
          return opt_result::failure_at (stmt, "exceeded param "
                                         "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}
/* Look for SLP-only access groups and turn each individual access into its own
   group.  */
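/* As an illustrative (hypothetical) example: accesses a[2*i] and a[2*i+1]
   may have been recorded as one interleaved group because SLP could use
   them together, but if the stmts end up not being SLP vectorized the
   group is broken up below into two single-element accesses, each made
   its own group leader with DR_GROUP_SIZE 1 and a gap covering the other
   former group members.  */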
static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
          dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
          unsigned int group_size = DR_GROUP_SIZE (first_element);

          /* Check if SLP-only groups.  */
          if (!STMT_SLP_TYPE (stmt_info)
              && STMT_VINFO_SLP_VECT_ONLY (first_element))
            {
              /* Dissolve the group.  */
              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

              stmt_vec_info vinfo = first_element;
              while (vinfo)
                {
                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
                  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
                  DR_GROUP_SIZE (vinfo) = 1;
                  if (STMT_VINFO_STRIDED_P (first_element))
                    DR_GROUP_GAP (vinfo) = 0;
                  else
                    DR_GROUP_GAP (vinfo) = group_size - 1;
                  /* Duplicate and adjust alignment info, it needs to
                     be present on each group leader, see dr_misalignment.  */
                  if (vinfo != first_element)
                    {
                      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
                      dr_info2->target_alignment = dr_info->target_alignment;
                      int misalignment = dr_info->misalignment;
                      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
                        {
                          HOST_WIDE_INT diff
                            = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
                               - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
                          unsigned HOST_WIDE_INT align_c
                            = dr_info->target_alignment.to_constant ();
                          misalignment = (misalignment + diff) % align_c;
                        }
                      dr_info2->misalignment = misalignment;
                    }
                  vinfo = next;
                }
            }
        }
    }
}
/* Determine if operating on full vectors for LOOP_VINFO might leave
   some scalar iterations still to do.  If so, decide how we should
   handle those scalar iterations.  The possibilities are:

   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
       In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == false

   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
       to handle the remaining scalar iterations.  In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == true

       There are two choices:

       (2a) Consider vectorizing the epilogue loop at the same VF as the
            main loop, but using partial vectors instead of full vectors.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true

       (2b) Consider vectorizing the epilogue loop at lower VFs only.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false

   When FOR_EPILOGUE_P is true, make this determination based on the
   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
   based on the assumption that LOOP_VINFO is the main loop.  The caller
   has made sure that the number of iterations is set appropriately for
   this value of FOR_EPILOGUE_P.  */
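/* For a concrete (hypothetical) feel for these options, consider a loop
   with 10 scalar iterations and a vectorization factor of 4:

   (1) emits a single vector loop that runs 3 times, with the last
       iteration operating on a partial vector of 2 active lanes
       (e.g. under a loop mask or length control);

   (2) emits a full-vector loop that runs twice (covering 8 iterations)
       plus an epilogue for the remaining 2 iterations, where the
       epilogue is either a partial-vector loop at the same VF (2a) or
       a loop vectorized at a lower VF, possibly followed by scalar
       code (2b).  */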
opt_result
vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
                                            bool for_epilogue_p)
{
  /* Determine whether there would be any scalar iterations left over.  */
  bool need_peeling_or_partial_vectors_p
    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);

  /* Decide whether to vectorize the loop with partial vectors.  */
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && need_peeling_or_partial_vectors_p)
    {
      /* For partial-vector-usage=1, try to push the handling of partial
         vectors to the epilogue, with the main loop continuing to operate
         on full vectors.

         If we are unrolling we also do not want to use partial vectors.  This
         is to avoid the overhead of generating multiple masks and also to
         avoid having to execute entire iterations of FALSE masked instructions
         when dealing with one or fewer full iterations.

         ??? We could then end up failing to use partial vectors if we
         decide to peel iterations into a prologue, and if the main loop
         then ends up processing fewer than VF iterations.  */
      if ((param_vect_partial_vector_usage == 1
           || loop_vinfo->suggested_unroll_factor > 1)
          && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
          && !vect_known_niters_smaller_than_vf (loop_vinfo))
        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
      else
        LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    }

  if (dump_enabled_p ())
    {
      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "operating on partial vectors%s.\n",
                         for_epilogue_p ? " for epilogue loop" : "");
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "operating only on full vectors%s.\n",
                         for_epilogue_p ? " for epilogue loop" : "");
    }

  if (for_epilogue_p)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      gcc_assert (orig_loop_vinfo);
      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                              LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Check that the loop processes at least one full vector.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
      if (known_lt (wi::to_widest (scalar_niters), vf))
        return opt_result::failure_at (vect_location,
                                       "loop does not have enough iterations"
                                       " to support vectorization.\n");

      /* If we need to peel an extra epilogue iteration to handle data
         accesses with gaps, check that there are enough scalar iterations
         available.

         The check above is redundant with this one when peeling for gaps,
         but the distinction is useful for diagnostics.  */
      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
          && known_lt (wi::to_widest (scalar_nitersm1), vf))
        return opt_result::failure_at (vect_location,
                                       "loop does not have enough iterations"
                                       " to support peeling for gaps.\n");
    }

  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && need_peeling_or_partial_vectors_p);

  return opt_result::success ();
}
2224 /* Function vect_analyze_loop_2.
2226 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227 for it. The different analyses will record information in the
2228 loop_vec_info struct. */
2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
,
2231 unsigned *suggested_unroll_factor
)
2233 opt_result ok
= opt_result::success ();
2235 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
2236 poly_uint64 min_vf
= 2;
2237 loop_vec_info orig_loop_vinfo
= NULL
;
2239 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240 loop_vec_info of the first vectorized loop. */
2241 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2242 orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2244 orig_loop_vinfo
= loop_vinfo
;
2245 gcc_assert (orig_loop_vinfo
);
2247 /* The first group of checks is independent of the vector size. */
2250 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
2251 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
2252 return opt_result::failure_at (vect_location
,
2253 "not vectorized: simd if(0)\n");
2255 /* Find all data references in the loop (which correspond to vdefs/vuses)
2256 and analyze their evolution in the loop. */
2258 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2260 /* Gather the data references and count stmts in the loop. */
2261 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
2264 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
2265 &LOOP_VINFO_DATAREFS (loop_vinfo
),
2266 &LOOP_VINFO_N_STMTS (loop_vinfo
));
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2271 "not vectorized: loop contains function "
2272 "calls or data references that cannot "
2276 loop_vinfo
->shared
->save_datarefs ();
2279 loop_vinfo
->shared
->check_datarefs ();
2281 /* Analyze the data references and also adjust the minimal
2282 vectorization factor according to the loads and stores. */
2284 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2289 "bad data references.\n");
2293 /* Classify all cross-iteration scalar data-flow cycles.
2294 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2295 vect_analyze_scalar_cycles (loop_vinfo
);
2297 vect_pattern_recog (loop_vinfo
);
2299 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
2301 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2304 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2309 "bad data access.\n");
2313 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2315 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2320 "unexpected pattern.\n");
2324 /* While the rest of the analysis below depends on it in some way. */
2327 /* Analyze data dependences between the data-refs in the loop
2328 and adjust the maximum vectorization factor according to
2330 FORNOW: fail at the first data dependence that we encounter. */
2332 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2337 "bad data dependence.\n");
2340 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2341 && maybe_lt (max_vf
, min_vf
))
2342 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2343 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2345 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2350 "can't determine vectorization factor.\n");
2353 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2354 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2355 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2357 /* Compute the scalar iteration cost. */
2358 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
2360 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2362 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2363 ok
= vect_analyze_slp (loop_vinfo
, LOOP_VINFO_N_STMTS (loop_vinfo
));
2367 /* If there are any SLP instances mark them as pure_slp. */
2368 bool slp
= vect_make_slp_decision (loop_vinfo
);
2371 /* Find stmts that need to be both vectorized and SLPed. */
2372 vect_detect_hybrid_slp (loop_vinfo
);
2374 /* Update the vectorization factor based on the SLP decision. */
2375 vect_update_vf_for_slp (loop_vinfo
);
2377 /* Optimize the SLP graph with the vectorization factor fixed. */
2378 vect_optimize_slp (loop_vinfo
);
2380 /* Gather the loads reachable from the SLP graph entries. */
2381 vect_gather_slp_loads (loop_vinfo
);
2384 bool saved_can_use_partial_vectors_p
2385 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
);
2387 /* We don't expect to have to roll back to anything other than an empty
2389 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2391 /* Apply the suggested unrolling factor, this was determined by the backend
during finish_cost the first time we ran the analysis for this
2394 if (loop_vinfo
->suggested_unroll_factor
> 1)
2395 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *= loop_vinfo
->suggested_unroll_factor
;
2397 /* This is the point where we can re-start analysis with SLP forced off. */
2400 /* Now the vectorization factor is final. */
2401 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2402 gcc_assert (known_ne (vectorization_factor
, 0U));
2404 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2406 dump_printf_loc (MSG_NOTE
, vect_location
,
2407 "vectorization_factor = ");
2408 dump_dec (MSG_NOTE
, vectorization_factor
);
2409 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2410 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2413 loop_vinfo
->vector_costs
= init_cost (loop_vinfo
, false);
2415 /* Analyze the alignment of the data-refs in the loop.
2416 Fail if a data reference is found that cannot be vectorized. */
2418 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2423 "bad data alignment.\n");
2427 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428 It is important to call pruning after vect_analyze_data_ref_accesses,
2429 since we use grouping information gathered by interleaving analysis. */
2430 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2434 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435 vectorization, since we do not want to add extra peeling or
2436 add versioning for alignment. */
2437 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2438 /* This pass will decide on using loop versioning and/or loop peeling in
2439 order to enhance the alignment of data references in the loop. */
2440 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2446 /* Analyze operations in the SLP instances. Note this may
2447 remove unsupported SLP instances which makes the above
2448 SLP kind detection invalid. */
2449 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2450 vect_slp_analyze_operations (loop_vinfo
);
2451 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2453 ok
= opt_result::failure_at (vect_location
,
2454 "unsupported SLP instances\n");
2458 /* Check whether any load in ALL SLP instances is possibly permuted. */
2459 slp_tree load_node
, slp_root
;
2461 slp_instance instance
;
2462 bool can_use_lanes
= true;
2463 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), x
, instance
)
2465 slp_root
= SLP_INSTANCE_TREE (instance
);
2466 int group_size
= SLP_TREE_LANES (slp_root
);
2467 tree vectype
= SLP_TREE_VECTYPE (slp_root
);
2468 bool loads_permuted
= false;
2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2471 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2474 stmt_vec_info load_info
;
2475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2476 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2478 loads_permuted
= true;
2483 /* If the loads and stores can be handled with load/store-lane
2484 instructions record it and move on to the next instance. */
2486 && SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2487 && vect_store_lanes_supported (vectype
, group_size
, false))
2489 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2491 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2492 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2493 /* Use SLP for strided accesses (or if we can't
2495 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2496 || ! vect_load_lanes_supported
2497 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2498 DR_GROUP_SIZE (stmt_vinfo
), false))
2503 = can_use_lanes
&& i
== SLP_INSTANCE_LOADS (instance
).length ();
2505 if (can_use_lanes
&& dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE
, vect_location
,
2507 "SLP instance %p can use load/store-lanes\n",
2512 can_use_lanes
= false;
2517 /* If all SLP instances can use load/store-lanes abort SLP and try again
2518 with SLP disabled. */
2521 ok
= opt_result::failure_at (vect_location
,
2522 "Built SLP cancelled: can use "
2523 "load/store-lanes\n");
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2526 "Built SLP cancelled: all SLP instances support "
2527 "load/store-lanes\n");
2532 /* Dissolve SLP-only groups. */
2533 vect_dissolve_slp_only_groups (loop_vinfo
);
2535 /* Scan all the remaining operations in the loop that are not subject
2536 to SLP and make sure they are vectorizable. */
2537 ok
= vect_analyze_loop_operations (loop_vinfo
);
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2542 "bad operation or unsupported loop bound.\n");
2546 /* For now, we don't expect to mix both masking and length approaches for one
2547 loop, disable it if both are recorded. */
2548 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2549 && !LOOP_VINFO_MASKS (loop_vinfo
).is_empty ()
2550 && !LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2554 "can't vectorize a loop with partial vectors"
2555 " because we don't expect to mix different"
2556 " approaches with partial vectors for the"
2558 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2561 /* If we still have the option of using partial vectors,
2562 check whether we can generate the necessary loop controls. */
2563 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2564 && !vect_verify_full_masking (loop_vinfo
)
2565 && !vect_verify_loop_lens (loop_vinfo
))
2566 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2568 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569 to be able to handle fewer than VF scalars, or needs to have a lower VF
2570 than the main loop. */
2571 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2572 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2573 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2574 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)))
2575 return opt_result::failure_at (vect_location
,
2576 "Vectorization factor too high for"
2577 " epilogue loop.\n");
2579 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580 assuming that the loop will be used as a main loop. We will redo
2581 this analysis later if we instead decide to use the loop as an
2583 ok
= vect_determine_partial_vectors_and_peeling (loop_vinfo
, false);
2587 /* Check the costings of the loop make vectorizing worthwhile. */
2588 res
= vect_analyze_loop_costing (loop_vinfo
, suggested_unroll_factor
);
2591 ok
= opt_result::failure_at (vect_location
,
2592 "Loop costings may not be worthwhile.\n");
2596 return opt_result::failure_at (vect_location
,
2597 "Loop costings not worthwhile.\n");
2599 /* If an epilogue loop is required make sure we can create one. */
2600 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2601 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2605 if (!vect_can_advance_ivs_p (loop_vinfo
)
2606 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2607 single_exit (LOOP_VINFO_LOOP
2610 ok
= opt_result::failure_at (vect_location
,
2611 "not vectorized: can't create required "
2617 /* During peeling, we need to check if number of loop iterations is
2618 enough for both peeled prolog loop and vector loop. This check
2619 can be merged along with threshold check of loop versioning, so
2620 increase threshold for this case if necessary.
2622 If we are analyzing an epilogue we still want to check what its
2623 versioning threshold would be. If we decide to vectorize the epilogues we
2624 will want to use the lowest versioning threshold of all epilogues and main
2625 loop. This will enable us to enter a vectorized epilogue even when
2626 versioning the loop. We can't simply check whether the epilogue requires
2627 versioning though since we may have skipped some versioning checks when
2628 analyzing the epilogue. For instance, checks for alias versioning will be
2629 skipped when dealing with epilogues as we assume we already checked them
2630 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
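/* A hypothetical example of the computation below: with a prologue
   peeling count of 3, a vectorization factor of 8 and peeling for gaps,
   niters_th accumulates to 3 + 8 + 1 = 12; if a runtime profitability
   check also applies with a cost-model threshold th = 20, the versioning
   threshold is raised to 20.  */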
2631 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo
))
2633 poly_uint64 niters_th
= 0;
2634 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2636 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2638 /* Niters for peeled prolog loop. */
2639 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2641 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2642 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2643 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2646 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2649 /* Niters for at least one iteration of vectorized loop. */
2650 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2651 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2652 /* One additional iteration because of peeling for gap. */
2653 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2656 /* Use the same condition as vect_transform_loop to decide when to use
2657 the cost to determine a versioning threshold. */
2658 if (vect_apply_runtime_profitability_check_p (loop_vinfo
)
2659 && ordered_p (th
, niters_th
))
2660 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
2662 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2665 gcc_assert (known_eq (vectorization_factor
,
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2668 /* Ok to vectorize! */
2669 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
2670 return opt_result::success ();
2673 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2676 /* Try again with SLP forced off but if we didn't do any SLP there is
2677 no point in re-trying. */
2681 /* If there are reduction chains re-trying will fail anyway. */
2682 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2685 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686 via interleaving or lane instructions. */
2687 slp_instance instance
;
2690 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2692 stmt_vec_info vinfo
;
2693 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2694 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2696 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2697 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2698 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2699 if (! vect_store_lanes_supported (vectype
, size
, false)
2700 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2701 && ! vect_grouped_store_supported (vectype
, size
))
2702 return opt_result::failure_at (vinfo
->stmt
,
2703 "unsupported grouped store\n");
2704 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2706 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2707 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2708 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2709 size
= DR_GROUP_SIZE (vinfo
);
2710 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2711 if (! vect_load_lanes_supported (vectype
, size
, false)
2712 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2714 return opt_result::failure_at (vinfo
->stmt
,
2715 "unsupported grouped load\n");
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE
, vect_location
,
2721 "re-trying with SLP disabled\n");
2723 /* Roll back state appropriately. No SLP this time. */
2725 /* Restore vectorization factor as it were without SLP. */
2726 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2727 /* Free the SLP instances. */
2728 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2729 vect_free_slp_instance (instance
);
2730 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2731 /* Reset SLP type to loop_vect on all stmts. */
2732 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2734 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2735 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2736 !gsi_end_p (si
); gsi_next (&si
))
2738 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2739 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2740 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2741 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
2743 /* vectorizable_reduction adjusts reduction stmt def-types,
2744 restore them to that of the PHI. */
2745 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
2746 = STMT_VINFO_DEF_TYPE (stmt_info
);
2747 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748 (STMT_VINFO_REDUC_DEF (stmt_info
)))
2749 = STMT_VINFO_DEF_TYPE (stmt_info
);
2752 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2753 !gsi_end_p (si
); gsi_next (&si
))
2755 if (is_gimple_debug (gsi_stmt (si
)))
2757 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2758 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2759 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2761 stmt_vec_info pattern_stmt_info
2762 = STMT_VINFO_RELATED_STMT (stmt_info
);
2763 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info
))
2764 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
2766 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2767 STMT_SLP_TYPE (pattern_stmt_info
) = loop_vect
;
2768 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2769 !gsi_end_p (pi
); gsi_next (&pi
))
2770 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2775 /* Free optimized alias test DDRS. */
2776 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2777 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2778 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2779 /* Reset target cost data. */
2780 delete loop_vinfo
->vector_costs
;
2781 loop_vinfo
->vector_costs
= nullptr;
2782 /* Reset accumulated rgroup information. */
2783 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
));
2784 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo
));
2785 /* Reset assorted flags. */
2786 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2787 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2788 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2789 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2790 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2791 = saved_can_use_partial_vectors_p
;
/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
   OLD_LOOP_VINFO is better unless something specifically indicates
   otherwise.

   Note that this deliberately isn't a partial order.  */
static bool
vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
                          loop_vec_info old_loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);

  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);

  /* Always prefer a VF of loop->simdlen over any other VF.  */
  if (loop->simdlen)
    {
      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
      if (new_simdlen_p != old_simdlen_p)
        return new_simdlen_p;
    }

  const auto *old_costs = old_loop_vinfo->vector_costs;
  const auto *new_costs = new_loop_vinfo->vector_costs;
  if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
    return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);

  return new_costs->better_main_loop_than_p (old_costs);
}
/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
   true if we should.  */

static bool
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
                        loop_vec_info old_loop_vinfo)
{
  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "***** Preferring vector mode %s to vector mode %s\n",
                     GET_MODE_NAME (new_loop_vinfo->vector_mode),
                     GET_MODE_NAME (old_loop_vinfo->vector_mode));

  return true;
}
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2849 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2850 MODE_I to the next mode useful to analyze.
2851 Return the loop_vinfo on success and wrapped null on failure. */
2853 static opt_loop_vec_info
2854 vect_analyze_loop_1 (class loop
*loop
, vec_info_shared
*shared
,
2855 const vect_loop_form_info
*loop_form_info
,
2856 loop_vec_info main_loop_vinfo
,
2857 const vector_modes
&vector_modes
, unsigned &mode_i
,
2858 machine_mode
&autodetected_vector_mode
,
2861 loop_vec_info loop_vinfo
2862 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
2864 machine_mode vector_mode
= vector_modes
[mode_i
];
2865 loop_vinfo
->vector_mode
= vector_mode
;
2866 unsigned int suggested_unroll_factor
= 1;
2868 /* Run the main analysis. */
2869 opt_result res
= vect_analyze_loop_2 (loop_vinfo
, fatal
,
2870 &suggested_unroll_factor
);
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE
, vect_location
,
2873 "***** Analysis %s with vector mode %s\n",
2874 res
? "succeeded" : " failed",
2875 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2877 if (!main_loop_vinfo
&& suggested_unroll_factor
> 1)
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_NOTE
, vect_location
,
2881 "***** Re-trying analysis for unrolling"
2882 " with unroll factor %d.\n",
2883 suggested_unroll_factor
);
2884 loop_vec_info unroll_vinfo
2885 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
2886 unroll_vinfo
->vector_mode
= vector_mode
;
2887 unroll_vinfo
->suggested_unroll_factor
= suggested_unroll_factor
;
2888 opt_result new_res
= vect_analyze_loop_2 (unroll_vinfo
, fatal
, NULL
);
2892 loop_vinfo
= unroll_vinfo
;
2895 delete unroll_vinfo
;
2898 /* Remember the autodetected vector mode. */
2899 if (vector_mode
== VOIDmode
)
2900 autodetected_vector_mode
= loop_vinfo
->vector_mode
;
2902 /* Advance mode_i, first skipping modes that would result in the
2903 same analysis result. */
2904 while (mode_i
+ 1 < vector_modes
.length ()
2905 && vect_chooses_same_modes_p (loop_vinfo
,
2906 vector_modes
[mode_i
+ 1]))
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE
, vect_location
,
2910 "***** The result for vector mode %s would"
2912 GET_MODE_NAME (vector_modes
[mode_i
+ 1]));
2915 if (mode_i
+ 1 < vector_modes
.length ()
2916 && VECTOR_MODE_P (autodetected_vector_mode
)
2917 && (related_vector_mode (vector_modes
[mode_i
+ 1],
2918 GET_MODE_INNER (autodetected_vector_mode
))
2919 == autodetected_vector_mode
)
2920 && (related_vector_mode (autodetected_vector_mode
,
2921 GET_MODE_INNER (vector_modes
[mode_i
+ 1]))
2922 == vector_modes
[mode_i
+ 1]))
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_NOTE
, vect_location
,
2926 "***** Skipping vector mode %s, which would"
2927 " repeat the analysis for %s\n",
2928 GET_MODE_NAME (vector_modes
[mode_i
+ 1]),
2929 GET_MODE_NAME (autodetected_vector_mode
));
2938 gcc_checking_assert (main_loop_vinfo
== NULL
);
2939 return opt_loop_vec_info::propagate_failure (res
);
2942 return opt_loop_vec_info::success (loop_vinfo
);
2945 /* Function vect_analyze_loop.
2947 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948 for it. The different analyses will record information in the
2949 loop_vec_info struct. */
2951 vect_analyze_loop (class loop
*loop
, vec_info_shared
*shared
)
2953 DUMP_VECT_SCOPE ("analyze_loop_nest");
2955 if (loop_outer (loop
)
2956 && loop_vec_info_for_loop (loop_outer (loop
))
2957 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2958 return opt_loop_vec_info::failure_at (vect_location
,
2959 "outer-loop already vectorized.\n");
2961 if (!find_loop_nest (loop
, &shared
->loop_nest
))
2962 return opt_loop_vec_info::failure_at
2964 "not vectorized: loop nest containing two or more consecutive inner"
2965 " loops cannot be vectorized\n");
2967 /* Analyze the loop form. */
2968 vect_loop_form_info loop_form_info
;
2969 opt_result res
= vect_analyze_loop_form (loop
, &loop_form_info
);
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2974 "bad loop form.\n");
2975 return opt_loop_vec_info::propagate_failure (res
);
2977 if (!integer_onep (loop_form_info
.assumptions
))
2979 /* We consider to vectorize this loop by versioning it under
2980 some assumptions. In order to do this, we need to clear
2981 existing information computed by scev and niter analyzer. */
2983 free_numbers_of_iterations_estimates (loop
);
2984 /* Also set flag for this loop so that following scev and niter
2985 analysis are done under the assumptions. */
2986 loop_constraint_set (loop
, LOOP_C_FINITE
);
2989 auto_vector_modes vector_modes
;
2990 /* Autodetect first vector size we try. */
2991 vector_modes
.safe_push (VOIDmode
);
2992 unsigned int autovec_flags
2993 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
2994 loop
->simdlen
!= 0);
2995 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
2996 && !unlimited_cost_model (loop
));
2997 machine_mode autodetected_vector_mode
= VOIDmode
;
2998 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2999 unsigned int mode_i
= 0;
3000 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
3002 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3003 a mode has not been analyzed. */
3004 auto_vec
<poly_uint64
, 8> cached_vf_per_mode
;
3005 for (unsigned i
= 0; i
< vector_modes
.length (); ++i
)
3006 cached_vf_per_mode
.safe_push (0);
3008 /* First determine the main loop vectorization mode, either the first
3009 one that works, starting with auto-detecting the vector mode and then
3010 following the targets order of preference, or the one with the
3011 lowest cost if pick_lowest_cost_p. */
3015 unsigned int last_mode_i
= mode_i
;
3016 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3018 cached_vf_per_mode
[last_mode_i
] = -1;
3019 opt_loop_vec_info loop_vinfo
3020 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3021 NULL
, vector_modes
, mode_i
,
3022 autodetected_vector_mode
, fatal
);
/* Analysis has been successful so update the VF value.  The
3029 VF should always be a multiple of unroll_factor and we want to
3030 capture the original VF here. */
3031 cached_vf_per_mode
[last_mode_i
]
3032 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
3033 loop_vinfo
->suggested_unroll_factor
);
3034 /* Once we hit the desired simdlen for the first time,
3035 discard any previous attempts. */
3037 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3039 delete first_loop_vinfo
;
3040 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3043 else if (pick_lowest_cost_p
3045 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3047 /* Pick loop_vinfo over first_loop_vinfo. */
3048 delete first_loop_vinfo
;
3049 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3051 if (first_loop_vinfo
== NULL
)
3052 first_loop_vinfo
= loop_vinfo
;
3056 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3059 /* Commit to first_loop_vinfo if we have no reason to try
3061 if (!simdlen
&& !pick_lowest_cost_p
)
3064 if (mode_i
== vector_modes
.length ()
3065 || autodetected_vector_mode
== VOIDmode
)
3068 /* Try the next biggest vector size. */
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE
, vect_location
,
3071 "***** Re-trying analysis with vector mode %s\n",
3072 GET_MODE_NAME (vector_modes
[mode_i
]));
3074 if (!first_loop_vinfo
)
3075 return opt_loop_vec_info::propagate_failure (res
);
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE
, vect_location
,
3079 "***** Choosing vector mode %s\n",
3080 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
3082 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083 enabled, SIMDUID is not set, it is the innermost loop and we have
3084 either already found the loop's SIMDLEN or there was no SIMDLEN to
3086 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3087 bool vect_epilogues
= (!simdlen
3088 && loop
->inner
== NULL
3089 && param_vect_epilogues_nomask
3090 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3092 if (!vect_epilogues
)
3093 return first_loop_vinfo
;
3095 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3096 poly_uint64 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3098 /* For epilogues start the analysis from the first mode. The motivation
3099 behind starting from the beginning comes from cases where the VECTOR_MODES
3100 array may contain length-agnostic and length-specific modes. Their
3101 ordering is not guaranteed, so we could end up picking a mode for the main
3102 loop that is after the epilogue's optimal mode. */
3103 vector_modes
[0] = autodetected_vector_mode
;
3106 bool supports_partial_vectors
=
3107 partial_vectors_supported_p () && param_vect_partial_vector_usage
!= 0;
3108 poly_uint64 first_vinfo_vf
= LOOP_VINFO_VECT_FACTOR (first_loop_vinfo
);
3112 /* If the target does not support partial vectors we can shorten the
3113 number of modes to analyze for the epilogue as we know we can't pick a
3114 mode that would lead to a VF at least as big as the
3116 if (!supports_partial_vectors
3117 && maybe_ge (cached_vf_per_mode
[mode_i
], first_vinfo_vf
))
3120 if (mode_i
== vector_modes
.length ())
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE
, vect_location
,
3127 "***** Re-trying epilogue analysis with vector "
3128 "mode %s\n", GET_MODE_NAME (vector_modes
[mode_i
]));
3131 opt_loop_vec_info loop_vinfo
3132 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3134 vector_modes
, mode_i
,
3135 autodetected_vector_mode
, fatal
);
3141 if (pick_lowest_cost_p
)
3143 /* Keep trying to roll back vectorization attempts while the
3144 loop_vec_infos they produced were worse than this one. */
3145 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3146 while (!vinfos
.is_empty ()
3147 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3149 gcc_assert (vect_epilogues
);
3150 delete vinfos
.pop ();
3153 /* For now only allow one epilogue loop. */
3154 if (first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3156 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3157 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3158 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3159 || maybe_ne (lowest_th
, 0U));
3160 /* Keep track of the known smallest versioning
3162 if (ordered_p (lowest_th
, th
))
3163 lowest_th
= ordered_min (lowest_th
, th
);
3168 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3171 /* For now only allow one epilogue loop, but allow
3172 pick_lowest_cost_p to replace it, so commit to the
3173 first epilogue if we have no reason to try alternatives. */
3174 if (!pick_lowest_cost_p
)
3178 if (mode_i
== vector_modes
.length ())
3183 if (!first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3185 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3186 if (dump_enabled_p ())
3187 dump_printf_loc (MSG_NOTE
, vect_location
,
3188 "***** Choosing epilogue vector mode %s\n",
3190 (first_loop_vinfo
->epilogue_vinfos
[0]->vector_mode
));
3193 return first_loop_vinfo
;
/* Return true if there is an in-order reduction function for CODE, storing
   it in *REDUC_FN if so.  */

static bool
fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
{
  if (code == PLUS_EXPR)
    {
      *reduc_fn = IFN_FOLD_LEFT_PLUS;
      return true;
    }
  return false;
}
/* Function reduction_fn_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_FN - the corresponding internal function to be used to reduce the
      vector of partial results into a single scalar result, or IFN_LAST
      if the operation is a supported reduction operation, but does not have
      such an internal function.

   Return FALSE if CODE currently cannot be vectorized as reduction.  */
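/* For example (illustrative only), for a sum reduction the scalar
   PLUS_EXPR maps to IFN_REDUC_PLUS, which adds all lanes of the final
   vector accumulator into one scalar:

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];      // lane-wise vector adds + IFN_REDUC_PLUS at the end

   whereas a MIN_EXPR reduction maps to IFN_REDUC_MIN, and so on.  */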
3224 reduction_fn_for_scalar_code (code_helper code
, internal_fn
*reduc_fn
)
3226 if (code
.is_tree_code ())
3227 switch (tree_code (code
))
3230 *reduc_fn
= IFN_REDUC_MAX
;
3234 *reduc_fn
= IFN_REDUC_MIN
;
3238 *reduc_fn
= IFN_REDUC_PLUS
;
3242 *reduc_fn
= IFN_REDUC_AND
;
3246 *reduc_fn
= IFN_REDUC_IOR
;
3250 *reduc_fn
= IFN_REDUC_XOR
;
3255 *reduc_fn
= IFN_LAST
;
3262 switch (combined_fn (code
))
3265 *reduc_fn
= IFN_REDUC_FMAX
;
3269 *reduc_fn
= IFN_REDUC_FMIN
;
/* If there is a neutral value X such that a reduction would not be affected
   by the introduction of additional X elements, return that X, otherwise
   return null.  CODE is the code of the reduction and SCALAR_TYPE is type
   of the scalar elements.  If the reduction has just a single initial value
   then INITIAL_VALUE is that value, otherwise it is null.  */
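/* Illustrative examples (matching the cases handled below): the neutral
   element is 0 for PLUS_EXPR (x + 0 == x), 1 for MULT_EXPR, and all-ones
   for BIT_AND_EXPR; for MIN_EXPR/MAX_EXPR there is no universal neutral
   value, so the single initial value itself is used when available.  */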
3284 neutral_op_for_reduction (tree scalar_type
, code_helper code
,
3287 if (code
.is_tree_code ())
3288 switch (tree_code (code
))
3290 case WIDEN_SUM_EXPR
:
3297 return build_zero_cst (scalar_type
);
3300 return build_one_cst (scalar_type
);
3303 return build_all_ones_cst (scalar_type
);
3307 return initial_value
;
3313 switch (combined_fn (code
))
3317 return initial_value
;
3324 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3325 STMT is printed with a message MSG. */
3328 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3330 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
/* Return true if we need an in-order reduction for operation CODE
   on type TYPE.  */
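/* A hypothetical motivating example: in IEEE float arithmetic
   (x + 1e38f) + -1e38f is not the same as x + (1e38f + -1e38f), so a
   float sum reduction cannot be reassociated across vector lanes unless
   -fassociative-math is in effect; it has to be performed in-order
   (fold-left) instead.  */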
static bool
needs_fold_left_reduction_p (tree type, code_helper code)
{
  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type))
    {
      if (code.is_tree_code ())
        switch (tree_code (code))
          {
          case MIN_EXPR:
          case MAX_EXPR:
            return false;

          default:
            return !flag_associative_math;
          }
      else
        switch (combined_fn (code))
          {
          CASE_CFN_FMIN:
          CASE_CFN_FMAX:
            return false;

          default:
            return !flag_associative_math;
          }
    }

  if (INTEGRAL_TYPE_P (type))
    return (!code.is_tree_code ()
            || !operation_no_trapping_overflow (type, tree_code (code)));

  if (SAT_FIXED_POINT_TYPE_P (type))
    return true;

  return false;
}
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3376 has a handled computation expression. Store the main reduction
3377 operation in *CODE. */
3380 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3381 tree loop_arg
, code_helper
*code
,
3382 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
3384 auto_bitmap visited
;
3385 tree lookfor
= PHI_RESULT (phi
);
3387 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
3388 while (USE_FROM_PTR (curr
) != loop_arg
)
3389 curr
= op_iter_next_use (&curri
);
3390 curri
.i
= curri
.numops
;
3393 path
.safe_push (std::make_pair (curri
, curr
));
3394 tree use
= USE_FROM_PTR (curr
);
3397 gimple
*def
= SSA_NAME_DEF_STMT (use
);
3398 if (gimple_nop_p (def
)
3399 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
3404 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
3408 curr
= op_iter_next_use (&curri
);
3409 /* Skip already visited or non-SSA operands (from iterating
3411 while (curr
!= NULL_USE_OPERAND_P
3412 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3413 || ! bitmap_set_bit (visited
,
3415 (USE_FROM_PTR (curr
)))));
3417 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
3418 if (curr
== NULL_USE_OPERAND_P
)
3423 if (gimple_code (def
) == GIMPLE_PHI
)
3424 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
3426 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
3427 while (curr
!= NULL_USE_OPERAND_P
3428 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3429 || ! bitmap_set_bit (visited
,
3431 (USE_FROM_PTR (curr
)))))
3432 curr
= op_iter_next_use (&curri
);
3433 if (curr
== NULL_USE_OPERAND_P
)
3438 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3440 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
3442 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
3443 FOR_EACH_VEC_ELT (path
, i
, x
)
3444 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
3445 dump_printf (MSG_NOTE
, "\n");
3448 /* Check whether the reduction path detected is valid. */
3449 bool fail
= path
.length () == 0;
3453 for (unsigned i
= 1; i
< path
.length (); ++i
)
3455 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
3457 if (!gimple_extract_op (use_stmt
, &op
))
3462 unsigned int opi
= op
.num_ops
;
3463 if (gassign
*assign
= dyn_cast
<gassign
*> (use_stmt
))
3465 /* The following make sure we can compute the operand index
3466 easily plus it mostly disallows chaining via COND_EXPR condition
3468 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
3469 if (gimple_assign_rhs1_ptr (assign
) + opi
== path
[i
].second
->use
)
3472 else if (gcall
*call
= dyn_cast
<gcall
*> (use_stmt
))
3474 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
3475 if (gimple_call_arg_ptr (call
, opi
) == path
[i
].second
->use
)
3478 if (opi
== op
.num_ops
)
3483 op
.code
= canonicalize_code (op
.code
, op
.type
);
3484 if (op
.code
== MINUS_EXPR
)
3486 op
.code
= PLUS_EXPR
;
3487 /* Track whether we negate the reduction value each iteration. */
3488 if (op
.ops
[1] == op
.ops
[opi
])
3491 if (CONVERT_EXPR_CODE_P (op
.code
)
3492 && tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
3494 else if (*code
== ERROR_MARK
)
3497 sign
= TYPE_SIGN (op
.type
);
3499 else if (op
.code
!= *code
)
3504 else if ((op
.code
== MIN_EXPR
3505 || op
.code
== MAX_EXPR
)
3506 && sign
!= TYPE_SIGN (op
.type
))
3511 /* Check there's only a single stmt the op is used on. For the
3512 not value-changing tail and the last stmt allow out-of-loop uses.
3513 ??? We could relax this and handle arbitrary live stmts by
3514 forcing a scalar epilogue for example. */
3515 imm_use_iterator imm_iter
;
3516 gimple
*op_use_stmt
;
3518 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
.ops
[opi
])
3519 if (!is_gimple_debug (op_use_stmt
)
3520 && (*code
!= ERROR_MARK
3521 || flow_bb_inside_loop_p (loop
, gimple_bb (op_use_stmt
))))
3523 /* We want to allow x + x but not x < 1 ? x : 2. */
3524 if (is_gimple_assign (op_use_stmt
)
3525 && gimple_assign_rhs_code (op_use_stmt
) == COND_EXPR
)
3527 use_operand_p use_p
;
3528 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
3540 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
3544 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3545 tree loop_arg
, enum tree_code code
)
3547 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3549 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, ie:
     for (int i = 0; i < N; i++)
       if (a[i] < val)
         ret_val = a[i];

*/
3599 static stmt_vec_info
3600 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3601 bool *double_reduc
, bool *reduc_chain_p
)
3603 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
3604 gimple
*phi_use_stmt
= NULL
;
3605 imm_use_iterator imm_iter
;
3606 use_operand_p use_p
;
3608 *double_reduc
= false;
3609 *reduc_chain_p
= false;
3610 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
3612 tree phi_name
= PHI_RESULT (phi
);
3613 /* ??? If there are no uses of the PHI result the inner loop reduction
3614 won't be detected as possibly double-reduction by vectorizable_reduction
3615 because that tries to walk the PHI arg from the preheader edge which
3616 can be constant. See PR60382. */
3617 if (has_zero_uses (phi_name
))
3619 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
3620 unsigned nphi_def_loop_uses
= 0;
3621 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3623 gimple
*use_stmt
= USE_STMT (use_p
);
3624 if (is_gimple_debug (use_stmt
))
3627 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3629 if (dump_enabled_p ())
3630 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3631 "intermediate value used outside loop.\n");
3636 nphi_def_loop_uses
++;
3637 phi_use_stmt
= use_stmt
;
3640 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
3641 if (TREE_CODE (latch_def
) != SSA_NAME
)
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3645 "reduction: not ssa_name: %T\n", latch_def
);
3649 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
3651 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
3654 bool nested_in_vect_loop
3655 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
3656 unsigned nlatch_def_loop_uses
= 0;
3657 auto_vec
<gphi
*, 3> lcphis
;
3658 bool inner_loop_of_double_reduc
= false;
3659 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
3661 gimple
*use_stmt
= USE_STMT (use_p
);
3662 if (is_gimple_debug (use_stmt
))
3664 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3665 nlatch_def_loop_uses
++;
3668 /* We can have more than one loop-closed PHI. */
3669 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
3670 if (nested_in_vect_loop
3671 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
3672 == vect_double_reduction_def
))
3673 inner_loop_of_double_reduc
= true;
3677 /* If we are vectorizing an inner reduction we are executing that
3678 in the original order only in case we are not dealing with a
3679 double reduction. */
3680 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
3682 if (dump_enabled_p ())
3683 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
3684 "detected nested cycle: ");
3685 return def_stmt_info
;
3688 /* When the inner loop of a double reduction ends up with more than
3689 one loop-closed PHI we have failed to classify alternate such
3690 PHIs as double reduction, leading to wrong code. See PR103237. */
3691 if (inner_loop_of_double_reduc
&& lcphis
.length () != 1)
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
"unhandled double reduction\n");
3699 /* If this isn't a nested cycle or if the nested cycle reduction value
is used outside of the inner loop we cannot handle uses of the reduction
3702 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
3704 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3706 "reduction used in loop.\n");
3710 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3711 defined in the inner loop. */
3712 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
3714 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
3715 if (gimple_phi_num_args (def_stmt
) != 1
3716 || TREE_CODE (op1
) != SSA_NAME
)
3718 if (dump_enabled_p ())
3719 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3720 "unsupported phi node definition.\n");
3725 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
3726 if (gimple_bb (def1
)
3727 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3729 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
3730 && (is_gimple_assign (def1
) || is_gimple_call (def1
))
3731 && is_a
<gphi
*> (phi_use_stmt
)
3732 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
3734 if (dump_enabled_p ())
3735 report_vect_op (MSG_NOTE
, def_stmt
,
3736 "detected double reduction: ");
3738 *double_reduc
= true;
3739 return def_stmt_info
;
/* Look for the expression computing latch_def from the loop PHI result.  */
3746 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3748 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
3751 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
3752 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
3753 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
3755 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3756 reduction chain for which the additional restriction is that
3757 all operations in the chain are the same. */
3758 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
3760 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
3761 for (i
= path
.length () - 1; i
>= 1; --i
)
3763 gimple
*stmt
= USE_STMT (path
[i
].second
);
3764 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
3766 if (!gimple_extract_op (stmt
, &op
))
3768 if (gassign
*assign
= dyn_cast
<gassign
*> (stmt
))
3769 STMT_VINFO_REDUC_IDX (stmt_info
)
3770 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (assign
);
3773 gcall
*call
= as_a
<gcall
*> (stmt
);
3774 STMT_VINFO_REDUC_IDX (stmt_info
)
3775 = path
[i
].second
->use
- gimple_call_arg_ptr (call
, 0);
3777 bool leading_conversion
= (CONVERT_EXPR_CODE_P (op
.code
)
3778 && (i
== 1 || i
== path
.length () - 1));
3779 if ((op
.code
!= code
&& !leading_conversion
)
3780 /* We can only handle the final value in epilogue
3781 generation for reduction chains. */
3782 || (i
!= 1 && !has_single_use (gimple_get_lhs (stmt
))))
3783 is_slp_reduc
= false;
3784 /* For reduction chains we support a trailing/leading
3785 conversions. We do not store those in the actual chain. */
3786 if (leading_conversion
)
3788 reduc_chain
.safe_push (stmt_info
);
3790 if (is_slp_reduc
&& reduc_chain
.length () > 1)
3792 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
3794 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
3795 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
3797 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
3798 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
3800 /* Save the chain for further analysis in SLP detection. */
3801 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
3802 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
3804 *reduc_chain_p
= true;
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE
, vect_location
,
3807 "reduction: detected reduction chain\n");
3809 else if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE
, vect_location
,
3811 "reduction: detected reduction\n");
3813 return def_stmt_info
;
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_NOTE
, vect_location
,
3818 "reduction: unknown pattern\n");
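/* An illustrative example (made up here, not tied to any particular loop)
   of the reduction chain case detected above:

       s_1 = phi <s_0, s_3>
       s_2 = s_1 + a[i];
       s_3 = s_2 + b[i];

   Both adds use the same CODE and each intermediate result has a single
   use, so the two statements are linked via REDUC_GROUP_FIRST_ELEMENT/
   REDUC_GROUP_NEXT_ELEMENT and recorded in LOOP_VINFO_REDUCTION_CHAINS,
   whereas a lone  s_2 = s_1 + a[i]  is reported as a plain reduction.  */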
/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
   PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
   or -1 if not known.  */

static int
vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
{
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "cost model: epilogue peel iters set to vf/2 "
                         "because loop iterations are unknown .\n");
      return assumed_vf / 2;
    }
  else
    {
      int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
      peel_iters_prologue = MIN (niters, peel_iters_prologue);
      int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
      /* If we need to peel for gaps, but no peeling is required, we have to
         peel VF iterations.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
        peel_iters_epilogue = assumed_vf;
      return peel_iters_epilogue;
    }
}
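/* A worked example of the computation above, with made-up numbers:
   NITERS = 100, VF (assumed_vf) = 8 and PEEL_ITERS_PROLOGUE = 3 give
   (100 - 3) % 8 = 1 epilogue iteration; with PEELING_FOR_GAPS and a
   remainder of 0 the epilogue would instead be bumped to a full VF = 8
   iterations.  */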
/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */

int
vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
                             int *peel_iters_epilogue,
                             stmt_vector_for_cost *scalar_cost_vec,
                             stmt_vector_for_cost *prologue_cost_vec,
                             stmt_vector_for_cost *epilogue_cost_vec)
{
  int retval = 0;

  *peel_iters_epilogue
    = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      /* If peeled iterations are known but number of scalar loop
         iterations are unknown, count a taken branch per peeled loop.  */
      if (peel_iters_prologue > 0)
        retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
                                   vect_prologue);
      if (*peel_iters_epilogue > 0)
        retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
                                    vect_epilogue);
    }

  stmt_info_for_cost *si;
  int j;
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      retval += record_stmt_cost (prologue_cost_vec,
                                  si->count * peel_iters_prologue,
                                  si->kind, si->stmt_info, si->misalign,
                                  vect_prologue);
  if (*peel_iters_epilogue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      retval += record_stmt_cost (epilogue_cost_vec,
                                  si->count * *peel_iters_epilogue,
                                  si->kind, si->stmt_info, si->misalign,
                                  vect_epilogue);

  return retval;
}
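/* For illustration only: with PEEL_ITERS_PROLOGUE = 2 and a scalar body
   recorded as three statements of count 1 each (say a load, an add and a
   store), the loops above record 2 * 3 = 6 prologue statement costs, and
   the epilogue side is handled symmetrically with *PEEL_ITERS_EPILOGUE.  */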
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
   of iterations for vectorization.  -1 value means loop vectorization
   is not profitable.  This returned value may be used for dynamic
   profitability check.

   *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
   for static check against estimated number of iterations.  */

static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                    int *ret_min_profitable_niters,
                                    int *ret_min_profitable_estimate,
                                    unsigned *suggested_unroll_factor)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  vector_costs *target_cost_data = loop_vinfo->vector_costs;

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      if (len)
        /* Count LEN - 1 ANDs and LEN comparisons.  */
        (void) add_stmt_cost (target_cost_data, len * 2 - 1,
                              scalar_stmt, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      if (len)
        {
          /* Count LEN - 1 ANDs and LEN comparisons.  */
          unsigned int nstmts = len * 2 - 1;
          /* +1 for each bias that needs adding.  */
          for (unsigned int i = 0; i < len; ++i)
            if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
              nstmts += 1;
          (void) add_stmt_cost (target_cost_data, nstmts,
                                scalar_stmt, vect_prologue);
        }
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
                            NULL, NULL, NULL_TREE, 0, vect_prologue);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning niters.\n");
    }

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  bool prologue_need_br_taken_cost = false;
  bool prologue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_prologue.  */
  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    peel_iters_prologue = 0;
  else if (npeel < 0)
    {
      peel_iters_prologue = assumed_vf / 2;
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "cost model: "
                     "prologue peel iters set to vf/2.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      prologue_need_br_taken_cost = true;
      prologue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_prologue = npeel;
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
        /* If peeled iterations are known but number of scalar loop
           iterations are unknown, count a taken branch per peeled loop.  */
        prologue_need_br_taken_cost = true;
    }

  bool epilogue_need_br_taken_cost = false;
  bool epilogue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_epilogue.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    /* We need to peel exactly one iteration for gaps.  */
    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  else if (npeel < 0)
    {
      /* If peeling for alignment is unknown, loop bound of main loop
         becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "peeling for alignment is unknown.\n");

      /* See the same reason above in peel_iters_prologue calculation.  */
      epilogue_need_br_taken_cost = true;
      epilogue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
        /* If peeled iterations are known but number of scalar loop
           iterations are unknown, count a taken branch per peeled loop.  */
        epilogue_need_br_taken_cost = true;
    }

  stmt_info_for_cost *si;
  int j;
  /* Add costs associated with peel_iters_prologue.  */
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
        (void) add_stmt_cost (target_cost_data,
                              si->count * peel_iters_prologue, si->kind,
                              si->stmt_info, si->node, si->vectype,
                              si->misalign, vect_prologue);
      }

  /* Add costs associated with peel_iters_epilogue.  */
  if (peel_iters_epilogue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
        (void) add_stmt_cost (target_cost_data,
                              si->count * peel_iters_epilogue, si->kind,
                              si->stmt_info, si->node, si->vectype,
                              si->misalign, vect_epilogue);
      }

  /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */

  if (prologue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_prologue);

  if (prologue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
                          cond_branch_not_taken, vect_prologue);

  if (epilogue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_epilogue);

  if (epilogue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
                          cond_branch_not_taken, vect_epilogue);
  /* Take care of special costs for rgroup controls of partial vectors.  */
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
        if (rgm->type)
          num_masks += num_vectors_m1 + 1;
      gcc_assert (num_masks > 0);

      /* In the worst case, we need to generate each mask in the prologue
         and in the loop body.  One of the loop body mask instructions
         replaces the comparison in the scalar loop, and since we don't
         count the scalar comparison against the scalar body, we shouldn't
         count that vector instruction against the vector body either.

         Sometimes we can use unpacks instead of generating prologue
         masks and sometimes the prologue mask will fold to a constant,
         so the actual prologue cost might be smaller.  However, it's
         simpler and safer to use the worst-case cost; if this ends up
         being the tie-breaker between vectorizing or not, then it's
         probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data, num_masks,
                            vector_stmt, NULL, NULL, NULL_TREE, 0,
                            vect_prologue);
      (void) add_stmt_cost (target_cost_data, num_masks - 1,
                            vector_stmt, NULL, NULL, NULL_TREE, 0,
                            vect_body);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Referring to the functions vect_set_loop_condition_partial_vectors
         and vect_set_loop_controls_directly, we need to generate each
         length in the prologue and in the loop body if required.  Although
         there are some possible optimizations, we consider the worst case
         here.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
      signed char partial_load_store_bias
        = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      bool need_iterate_p
        = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
           && !vect_known_niters_smaller_than_vf (loop_vinfo));

      /* Calculate how many statements to be added.  */
      unsigned int prologue_stmts = 0;
      unsigned int body_stmts = 0;

      rgroup_controls *rgc;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
        if (rgc->type)
          {
            /* May need one SHIFT for nitems_total computation.  */
            unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
            if (nitems != 1 && !niters_known_p)
              prologue_stmts += 1;

            /* May need one MAX and one MINUS for wrap around.  */
            if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
              prologue_stmts += 2;

            /* Need one MAX and one MINUS for each batch limit excepting for
               the 1st one.  */
            prologue_stmts += num_vectors_m1 * 2;

            unsigned int num_vectors = num_vectors_m1 + 1;

            /* Need to set up lengths in prologue, only one MIN required
               for each since start index is zero.  */
            prologue_stmts += num_vectors;

            /* If we have a non-zero partial load bias, we need one PLUS
               to adjust the load length.  */
            if (partial_load_store_bias != 0)
              body_stmts += 1;

            /* Each may need two MINs and one MINUS to update lengths in body
               for next iteration.  */
            if (need_iterate_p)
              body_stmts += 3 * num_vectors;
          }

      (void) add_stmt_cost (target_cost_data, prologue_stmts,
                            scalar_stmt, vect_prologue);
      (void) add_stmt_cost (target_cost_data, body_stmts,
                            scalar_stmt, vect_body);
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
        scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
              + vect_get_stmt_cost (cond_branch_not_taken);
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
        }
    }
  /* Complete the target-specific cost calculations.  */
  finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
               &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
               suggested_unroll_factor);

  if (suggested_unroll_factor && *suggested_unroll_factor > 1
      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
                    *suggested_unroll_factor,
                    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't unroll as unrolled vectorization factor larger"
                         " than maximum vectorization factor: %d\n",
                         LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
      *suggested_unroll_factor = 1;
    }

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
                   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
                   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
                   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
                   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
                   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
                   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
                   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
                   peel_iters_epilogue);
    }
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
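  /* A worked example of the condition above, using made-up costs:
     SIC = 4, VIC = 8, VF = 4, NPEEL = 0, VOC = 24, SOC = 0.  The scalar
     loop then costs 4 * niters while the vector version costs
     8 * (niters / 4) + 24 = 2 * niters + 24, so vectorization starts to
     pay off once 4 * niters > 2 * niters + 24, i.e. for niters > 12.  */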
  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
                          - vec_inside_cost);
  if (saving_per_viter <= 0)
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
        warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
                    "vectorization did not happen for a simd loop");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cost model: the vector iteration cost = %d "
                         "divided by the scalar iteration cost = %d "
                         "is greater or equal to the vectorization factor = %d"
                         ".\n",
                         vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  /* ??? The "if" arm is written to handle all cases; see below for what
     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Rewriting the condition above in terms of the number of
         vector iterations (vniters) rather than the number of
         scalar iterations (niters) gives:

         SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC

         <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC

         For integer N, X and Y when X > 0:

         N * X > Y <==> N >= (Y /[floor] X) + 1.  */
      int outside_overhead = (vec_outside_cost
                              - scalar_single_iter_cost * peel_iters_prologue
                              - scalar_single_iter_cost * peel_iters_epilogue
                              - scalar_outside_cost);
      /* We're only interested in cases that require at least one
         vector iteration.  */
      int min_vec_niters = 1;
      if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
                     min_vec_niters);

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Now that we know the minimum number of vector iterations,
             find the minimum niters for which the scalar cost is larger:

             SIC * niters > VIC * vniters + VOC - SOC

             We know that the minimum niters is no more than
             vniters * VF + NPEEL, but it might be (and often is) less
             than that if a partial vector iteration is cheaper than the
             equivalent scalar code.  */
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
                           - scalar_outside_cost);
          if (threshold <= 0)
            min_profitable_iters = 1;
          else
            min_profitable_iters = threshold / scalar_single_iter_cost + 1;
        }
      else
        /* Convert the number of vector iterations into a number of
           scalar iterations.  */
        min_profitable_iters = (min_vec_niters * assumed_vf
                                + peel_iters_prologue
                                + peel_iters_epilogue);
    }
  else
    {
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
                              * assumed_vf
                              - vec_inside_cost * peel_iters_prologue
                              - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
        min_profitable_iters = 0;
      else
        {
          min_profitable_iters /= saving_per_viter;

          if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
              <= (((int) vec_inside_cost * min_profitable_iters)
                  + (((int) vec_outside_cost - scalar_outside_cost)
                     * assumed_vf)))
            min_profitable_iters++;
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
                 "  Calculated minimum iters for profitability: %d\n",
                 min_profitable_iters);

  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;
  else if (min_profitable_iters < peel_iters_prologue)
    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
       vectorized loop executes at least once.  */
    min_profitable_iters = peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  Runtime profitability threshold = %d\n",
                     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  /* ??? This "else if" arm is written to handle all cases; see below for
     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* This is a repeat of the code above, but with + SOC rather
         than - SOC.  */
      int outside_overhead = (vec_outside_cost
                              - scalar_single_iter_cost * peel_iters_prologue
                              - scalar_single_iter_cost * peel_iters_epilogue
                              + scalar_outside_cost);
      int min_vec_niters = 1;
      if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
                           + scalar_outside_cost);
          min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
        }
      else
        min_profitable_estimate = (min_vec_niters * assumed_vf
                                   + peel_iters_prologue
                                   + peel_iters_epilogue);
    }
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
                                 * assumed_vf
                                 - vec_inside_cost * peel_iters_prologue
                                 - vec_inside_cost * peel_iters_epilogue)
                                / ((scalar_single_iter_cost * assumed_vf)
                                   - vec_inside_cost);
    }
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  Static estimate profitability threshold = %d\n",
                     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
   vector elements (not bits) for a vector with NELT elements.  */
static void
calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
                              vec_perm_builder *sel)
{
  /* The encoding is a single stepped pattern.  Any wrap-around is handled
     by vec_perm_indices.  */
  sel->new_vector (nelt, 1, 3);
  for (unsigned int i = 0; i < 3; i++)
    sel->quick_push (i + offset);
}
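/* For example, for NELT = 4 and OFFSET = 2 the encoding above expands to
   the selector {2, 3, 4, 5}: the first two elements come from the input
   vector and the indices >= NELT wrap into the second vec_perm operand,
   which gives the effect of a whole-vector shift by two elements.  */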
/* Checks whether the target supports whole-vector shifts for vectors of mode
   MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
   it supports vec_perm_const with masks for all necessary shift amounts.  */
static bool
have_whole_vector_shift (machine_mode mode)
{
  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Variable-length vectors should be handled via the optab.  */
  unsigned int nelt;
  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
    return false;

  vec_perm_builder sel;
  vec_perm_indices indices;
  for (unsigned int i = nelt / 2; i >= 1; i /= 2)
    {
      calc_vec_perm_mask_for_shift (i, nelt, &sel);
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (mode, indices, false))
        return false;
    }
  return true;
}
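/* As an example, for a fixed-length mode with NELT = 8 the loop above
   checks vec_perm support for shift amounts 4, 2 and 1, which are exactly
   the offsets the reduction epilogue uses when reducing a vector by
   repeated halving.  */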
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop in some cases, the initial
   definition before the loop, and the epilogue code that must be generated.  */

static void
vect_model_reduction_cost (loop_vec_info loop_vinfo,
                           stmt_vec_info stmt_info, internal_fn reduc_fn,
                           vect_reduction_type reduction_type,
                           int ncopies, stmt_vector_for_cost *cost_vec)
{
  int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
  tree vectype;
  machine_mode mode;
  class loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Condition reductions generate two reductions in the loop.  */
  if (reduction_type == COND_REDUCTION)
    ncopies *= 2;

  vectype = STMT_VINFO_VECTYPE (stmt_info);
  mode = TYPE_MODE (vectype);
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);

  gimple_match_op op;
  if (!gimple_extract_op (orig_stmt_info->stmt, &op))
    gcc_unreachable ();

  if (reduction_type == EXTRACT_LAST_REDUCTION)
    /* No extra instructions are needed in the prologue.  The loop body
       operations are costed in vectorizable_condition.  */
    inside_cost = 0;
  else if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* No extra instructions needed in the prologue.  */
      prologue_cost = 0;

      if (reduc_fn != IFN_LAST)
        /* Count one reduction-like operation per vector.  */
        inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
                                        stmt_info, 0, vect_body);
      else
        {
          /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
          unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
          inside_cost = record_stmt_cost (cost_vec, nelements,
                                          vec_to_scalar, stmt_info, 0,
                                          vect_body);
          inside_cost += record_stmt_cost (cost_vec, nelements,
                                           scalar_stmt, stmt_info, 0,
                                           vect_body);
        }
    }
  else
    {
      /* Add in cost for initial definition.
         For cond reduction we have four vectors: initial index, step,
         initial result of the data reduction, initial value of the index
         reduction.  */
      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
                                         scalar_to_vec, stmt_info, 0,
                                         vect_prologue);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
    {
      if (reduc_fn != IFN_LAST)
        {
          if (reduction_type == COND_REDUCTION)
            {
              /* An EQ stmt and an COND_EXPR stmt.  */
              epilogue_cost += record_stmt_cost (cost_vec, 2,
                                                 vector_stmt, stmt_info, 0,
                                                 vect_epilogue);
              /* Reduction of the max index and a reduction of the found
                 values.  */
              epilogue_cost += record_stmt_cost (cost_vec, 2,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
              /* A broadcast of the max value.  */
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 scalar_to_vec, stmt_info, 0,
                                                 vect_epilogue);
            }
          else
            {
              epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
                                                 stmt_info, 0, vect_epilogue);
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
            }
        }
      else if (reduction_type == COND_REDUCTION)
        {
          unsigned estimated_nunits = vect_nunits_for_cost (vectype);
          /* Extraction of scalar elements.  */
          epilogue_cost += record_stmt_cost (cost_vec,
                                             2 * estimated_nunits,
                                             vec_to_scalar, stmt_info, 0,
                                             vect_epilogue);
          /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
          epilogue_cost += record_stmt_cost (cost_vec,
                                             2 * estimated_nunits - 3,
                                             scalar_stmt, stmt_info, 0,
                                             vect_epilogue);
        }
      else if (reduction_type == EXTRACT_LAST_REDUCTION
               || reduction_type == FOLD_LEFT_REDUCTION)
        /* No extra instructions need in the epilogue.  */
        ;
      else
        {
          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
          tree bitsize = TYPE_SIZE (op.type);
          int element_bitsize = tree_to_uhwi (bitsize);
          int nelements = vec_size_in_bits / element_bitsize;

          if (op.code == COND_EXPR)
            op.code = MAX_EXPR;

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && directly_supported_p (op.code, vectype)
              && have_whole_vector_shift (mode))
            {
              /* Final reduction via vector shifts and the reduction operator.
                 Also requires scalar extract.  */
              epilogue_cost += record_stmt_cost (cost_vec,
                                                 exact_log2 (nelements) * 2,
                                                 vector_stmt, stmt_info, 0,
                                                 vect_epilogue);
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
            }
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            epilogue_cost += record_stmt_cost (cost_vec,
                                               nelements + nelements - 1,
                                               vector_stmt, stmt_info, 0,
                                               vect_epilogue);
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
                 "vect_model_reduction_cost: inside_cost = %d, "
                 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
                 prologue_cost, epilogue_cost);
}
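/* As a rough illustration of the no-reduc_fn path above: for a V4SI add
   reduction with whole-vector shifts available, the epilogue is costed as
   exact_log2 (4) * 2 = 4 vector statements (two shift/add rounds) plus one
   vec_to_scalar extract; without the shifts it would instead be
   4 + 4 - 1 = 7 statements (four extracts and three reduction ops).  */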
/* SEQ is a sequence of instructions that initialize the reduction
   described by REDUC_INFO.  Emit them in the appropriate place.  */

static void
vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info, gimple *seq)
{
  if (reduc_info->reused_accumulator)
    {
      /* When reusing an accumulator from the main loop, we only need
         initialization instructions if the main loop can be skipped.
         In that case, emit the initialization instructions at the end
         of the guard block that does the skip.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      gcc_assert (skip_edge);
      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      /* The normal case: emit the initialization instructions on the
         preheader edge.  */
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
    }
}
/* Function get_initial_def_for_reduction

   Input:
   REDUC_INFO - the info_for_reduction
   INIT_VAL - the initial value of the reduction variable
   NEUTRAL_OP - a value that has no effect on the reduction, as per
                neutral_op_for_reduction

   Output:
   Return a vector variable, initialized according to the operation that
   STMT_VINFO performs.  This vector will be used as the initial value
   of the vector of partial results.

   The value we need is a vector in which element 0 has value INIT_VAL
   and every other element has value NEUTRAL_OP.  */

static tree
get_initial_def_for_reduction (loop_vec_info loop_vinfo,
                               stmt_vec_info reduc_info,
                               tree init_val, tree neutral_op)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  tree init_def;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
              || SCALAR_FLOAT_TYPE_P (scalar_type));

  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
              || loop == (gimple_bb (reduc_info->stmt))->loop_father);

  if (operand_equal_p (init_val, neutral_op))
    {
      /* If both elements are equal then the vector described above is
         just a splat.  */
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
    }
  else
    {
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
        {
          /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
             element 0.  */
          init_def = gimple_build_vector_from_val (&stmts, vectype,
                                                   neutral_op);
          init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
                                   vectype, init_def, init_val);
        }
      else
        {
          /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
          tree_vector_builder elts (vectype, 1, 2);
          elts.quick_push (init_val);
          elts.quick_push (neutral_op);
          init_def = gimple_build_vector (&stmts, &elts);
        }
    }

  if (stmts)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
  return init_def;
}
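/* For example, for a V4SI add reduction with INIT_VAL 5 the neutral value
   is 0 and the routine above builds {5, 0, 0, 0}; for a MIN or MAX
   reduction the neutral value equals INIT_VAL, so the result is simply a
   splat of it.  */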
/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
   which performs a reduction involving GROUP_SIZE scalar statements.
   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
   is nonnull, introducing extra elements of that value will not change the
   result.  */

static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info,
                                vec<tree> *vec_oprnds,
                                unsigned int number_of_vectors,
                                unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = reduc_info->reduc_initial_values;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
  unsigned int i;

  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;

      /* Get the def before the loop.  In reduction chain we have only
         one initial value.  Else we have as many as PHIs in the group.  */
      if (i >= initial_values.length () || (j > i && neutral_op))
        op = neutral_op;
      else
        op = initial_values[i];

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
        constant_p = false;

      if (number_of_places_left_in_vector == 0)
        {
          tree init;
          if (constant_p && !neutral_op
              ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
              : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
            /* Build the vector directly from ELTS.  */
            init = gimple_build_vector (&ctor_seq, &elts);
          else if (neutral_op)
            {
              /* Build a vector of the neutral value and shift the
                 other elements into place.  */
              init = gimple_build_vector_from_val (&ctor_seq, vector_type,
                                                   neutral_op);
              int k = nunits;
              while (k > 0 && elts[k - 1] == neutral_op)
                k -= 1;
              while (k > 0)
                {
                  k -= 1;
                  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
                                       vector_type, init, elts[k]);
                }
            }
          else
            {
              /* First time round, duplicate ELTS to fill the
                 required number of vectors.  */
              duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
                                        elts, number_of_vectors, *vec_oprnds);
              break;
            }
          vec_oprnds->quick_push (init);

          number_of_places_left_in_vector = nunits;
          elts.new_vector (vector_type, nunits, 1);
          elts.quick_grow (nunits);
          constant_p = true;
        }
    }

  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
/* For a statement STMT_INFO taking part in a reduction operation return
   the stmt_vec_info the meta information is stored on.  */

stmt_vec_info
info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
{
  stmt_info = vect_orig_stmt (stmt_info);
  gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
  if (!is_a <gphi *> (stmt_info->stmt)
      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
    stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      if (gimple_phi_num_args (phi) == 1)
        stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
    }
  else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
      if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
        stmt_info = info;
    }
  return stmt_info;
}
/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
   REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
   return false.  */

static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info)
{
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    return false;

  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
         from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : reduc_info->reduc_initial_values)
        {
          /* Look for:

               INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
                                    INITIAL_VALUE(guard block)>.  */
          gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

          gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
          gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

          tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
          tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

          main_loop_results.quick_push (from_main_loop);
          initial_values.quick_push (from_skip);
        }
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (reduc_info->reduc_initial_values);

  /* See if the main loop has the kind of accumulator we need.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
  if (!accumulator
      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
                      accumulator->reduc_info->reduc_scalar_results.begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.  */
  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
                            TYPE_VECTOR_SUBPARTS (vectype), &m))
    return false;
  /* Check the intermediate vector types and operations are available.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (intermediate_nunits, 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
          || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
                                    intermediate_vectype)
          || !can_vec_extract (TYPE_MODE (prev_vectype),
                               TYPE_MODE (intermediate_vectype)))
        return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment.  (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.)  */
  tree main_adjustment
    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
         initialize the accumulator with a neutral value instead.  */
      if (!operand_equal_p (initial_value, main_adjustment))
        return false;
      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
                                                    code, initial_value);
    }
  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  reduc_info->reduc_initial_values.truncate (0);
  reduc_info->reduc_initial_values.splice (initial_values);
  reduc_info->reused_accumulator = accumulator;
  return true;
}
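/* An illustrative (target-independent) example of the widening check above:
   if the main loop accumulated into a V8SI vector and this epilogue loop
   uses V4SI, then m = 2 and the loop verifies that a V4SI reduction
   operation and a V8SI -> V4SI vec_extract are both supported before the
   accumulator is reused.  */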
/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
   CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */

static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
                            gimple_seq *seq)
{
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                           stype, nunits);
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
         extraction, either via direct vector extract or through
         an integer mode punning.  */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (vec_extract_optab,
                                 TYPE_MODE (TREE_TYPE (new_temp)),
                                 TYPE_MODE (vectype1))
          != CODE_FOR_nothing)
        {
          /* Extract sub-vectors directly once vec_extract becomes
             a conversion optab.  */
          dst1 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst1, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst2, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }
      else
        {
          /* Extract via punning to appropriately sized integer mode
             vector.  */
          tree eltype = build_nonstandard_integer_type (bitsize, 1);
          tree etype = build_vector_type (eltype, 2);
          gcc_assert (convert_optab_handler (vec_extract_optab,
                                             TYPE_MODE (etype),
                                             TYPE_MODE (eltype))
                      != CODE_FOR_nothing);
          tree tem = make_ssa_name (etype);
          epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     etype, new_temp));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          new_temp = tem;
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst1 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }

      new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    }

  return new_temp;
}
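/* For example, reducing a V8SI partial result down to V4SI with PLUS_EXPR
   takes a single round of the loop above: the V8SI value is split into its
   low and high V4SI halves (directly via vec_extract, or via the integer
   punning fallback) and the two halves are added to form the V4SI def that
   is returned.  */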
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   STMT_INFO is the scalar reduction stmt that is being vectorized.
   SLP_NODE is an SLP node containing a group of reduction statements. The
     first one in this group is STMT_INFO.
   SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
   REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
     (counting from 0)

   This function:
   1. Completes the reduction def-use cycles.
   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
      by calling the function specified by REDUC_FN if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

     The flow at the entry to this function:

        loop:
          vec_def = phi <vec_init, null> # REDUCTION_PHI
          VECT_DEF = vector_stmt         # vectorized form of STMT_INFO
          s_loop = scalar_stmt           # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>          # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

     The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
          VECT_DEF = vector_stmt             # vectorized form of STMT_INFO
          s_loop = scalar_stmt               # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>              # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>            # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */

static void
vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
                                  stmt_vec_info stmt_info,
                                  slp_tree slp_node,
                                  slp_instance slp_node_instance)
{
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);
  /* For double reductions we need to get at the inner loop reduction
     stmt which has the meta info attached.  Our stmt_info is that of the
     loop-closed PHI of the inner loop which we remember as
     def for the reduction PHI generation.  */
  bool double_reduc = false;
  stmt_vec_info rdef_info = stmt_info;
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      gcc_assert (!slp_node);
      double_reduc = true;
      stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
                                            (stmt_info->stmt, 0));
      stmt_info = vect_stmt_to_vectorize (stmt_info);
    }
  gphi *reduc_def_stmt
    = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
  code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
  tree vectype;
  machine_mode mode;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  gimple *new_phi = NULL, *phi;
  gimple_stmt_iterator exit_gsi;
  tree new_temp = NULL_TREE, new_name, new_scalar_dest;
  gimple *epilog_stmt = NULL;
  tree bitsize;
  tree def;
  tree orig_name, scalar_result;
  imm_use_iterator imm_iter, phi_imm_iter;
  use_operand_p use_p, phi_use_p;
  auto_vec<tree> reduc_inputs;
  int j, i;
  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
  unsigned int group_size = 1, k;
  auto_vec<gimple *> phis;
  /* SLP reduction without reduction chain, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (b1)  */
  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  bool direct_slp_reduc;
  tree induction_index = NULL_TREE;

  if (slp_node)
    group_size = SLP_TREE_LANES (slp_node);

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      outer_loop = loop;
      loop = loop->inner;
      gcc_assert (!slp_node && double_reduc);
    }

  vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
  gcc_assert (vectype);
  mode = TYPE_MODE (vectype);

  tree induc_val = NULL_TREE;
  tree adjustment_def = NULL;
  if (slp_node)
    ;
  else
    {
      /* Optimize: for induction condition reduction, if we can't use zero
         for induc_val, use initial_def.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
        induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
      else if (double_reduc)
        ;
      else
        adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
    }

  stmt_vec_info single_live_out_stmt[] = { stmt_info };
  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
  if (slp_reduc)
    /* All statements produce live-out values.  */
    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  else if (slp_node)
    /* The last statement in the reduction chain produces the live-out
       value.  */
    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];

  unsigned vec_num;
  int ncopies;
  if (slp_node)
    {
      vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
      ncopies = 1;
    }
  else
    {
      stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
      vec_num = 1;
      ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
    }

  /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
     which is updated with the current index of the loop for every match of
     the original loop's cond_expr (VEC_STMT).  This results in a vector
     containing the last time the condition passed for that vector lane.
     The first match will be a 1 to allow 0 to be used for non-matching
     indexes.  If there are no matches at all then the vector will be all
     zeroes.

     PR92772: This algorithm is broken for architectures that support
     masked vectors, but do not provide fold_extract_last.  */
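  /* As a hypothetical illustration of the scheme described above, with
     VF = 4 the index IV takes the values {1, 2, 3, 4}, {5, 6, 7, 8}, ...
     If the condition matches in lane 1 of the first vector iteration and
     in lane 0 of the second one, the final INDUCTION_INDEX is {5, 2, 0, 0}:
     each lane records the (1-based) iteration number of its last match and
     0 means the lane never matched.  */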
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
    {
      auto_vec<std::pair<tree, bool>, 2> ccompares;
      stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
      cond_info = vect_stmt_to_vectorize (cond_info);
      while (cond_info != reduc_info)
        {
          if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
            {
              gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
              gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
              ccompares.safe_push
                (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
                                 STMT_VINFO_REDUC_IDX (cond_info) == 2));
            }
          cond_info
            = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
                                                 1 + STMT_VINFO_REDUC_IDX
                                                        (cond_info)));
          cond_info = vect_stmt_to_vectorize (cond_info);
        }
      gcc_assert (ccompares.length () != 0);

      tree indx_before_incr, indx_after_incr;
      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
      int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
      tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
      tree cr_index_vector_type = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), cr_index_scalar_type,
         TYPE_VECTOR_SUBPARTS (vectype));

      /* First we create a simple vector induction variable which starts
         with the values {1,2,3,...} (SERIES_VECT) and increments by the
         vector size (STEP).  */

      /* Create a {1,2,3,...} vector.  */
      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);

      /* Create a vector of the step value.  */
      tree step = build_int_cst (cr_index_scalar_type, nunits_out);
      tree vec_step = build_vector_from_val (cr_index_vector_type, step);

      /* Create an induction variable.  */
      gimple_stmt_iterator incr_gsi;
      bool insert_after;
      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
      create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
                 insert_after, &indx_before_incr, &indx_after_incr);

      /* Next create a new phi node vector (NEW_PHI_TREE) which starts
         filled with zeros (VEC_ZERO).  */

      /* Create a vector of 0s.  */
      tree zero = build_zero_cst (cr_index_scalar_type);
      tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);

      /* Create a vector phi node.  */
      tree new_phi_tree = make_ssa_name (cr_index_vector_type);
      new_phi = create_phi_node (new_phi_tree, loop->header);
      add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
                   loop_preheader_edge (loop), UNKNOWN_LOCATION);

      /* Now take the condition from the loops original cond_exprs
         and produce a new cond_exprs (INDEX_COND_EXPR) which for
         every match uses values from the induction variable
         (INDEX_BEFORE_INCR) otherwise uses values from the phi node
         (NEW_PHI_TREE).
         Finally, we update the phi (NEW_PHI_TREE) to take the value of
         the new cond_expr (INDEX_COND_EXPR).  */
      gimple_seq stmts = NULL;
      for (int i = ccompares.length () - 1; i != -1; --i)
        {
          tree ccompare = ccompares[i].first;
          if (ccompares[i].second)
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         indx_before_incr, new_phi_tree);
          else
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         new_phi_tree, indx_before_incr);
        }
      gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);

      /* Update the phi with the vec cond.  */
      induction_index = new_phi_tree;
      add_phi_arg (as_a <gphi *> (new_phi), induction_index,
                   loop_latch_edge (loop), UNKNOWN_LOCATION);
    }

  /* 2. Create epilog code.
     The reduction epilog code operates across the elements of the vector
     of partial results computed by the vectorized loop.
     The reduction epilog code consists of:

     step 1: compute the scalar result in a vector (v_out2)
     step 2: extract the scalar result (s_out3) from the vector (v_out2)
     step 3: adjust the scalar result (s_out3) if needed.

     Step 1 can be accomplished using one the following three schemes:
          (scheme 1) using reduc_fn, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop. In this case steps 1+2 above are
                     combined.

     The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  */
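  /* Scheme 2 above, sketched for a four-element add reduction (assuming
     the target provides whole-vector shifts):

          v_out2 = v_out1 + vec_shr <v_out1, 2 elements>
          v_out2 = v_out2 + vec_shr <v_out2, 1 element>
          s_out3 = extract_field <v_out2, 0>

     after which element 0 holds the sum of all four lanes.  */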
  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
	 v_out1 = phi <VECT_DEF>
	 Store them in NEW_PHIS.  */
  if (double_reduc)
    loop = outer_loop;
  exit_bb = single_exit (loop)->dest;
  exit_gsi = gsi_after_labels (exit_bb);
  reduc_inputs.create (slp_node ? vec_num : ncopies);
  for (unsigned i = 0; i < vec_num; i++)
    {
      gimple_seq stmts = NULL;
      if (slp_node)
	def = vect_get_slp_vect_def (slp_node, i);
      else
	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
      for (j = 0; j < ncopies; j++)
	{
	  tree new_def = copy_ssa_name (def);
	  phi = create_phi_node (new_def, exit_bb);
	  if (j)
	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
	  new_def = gimple_convert (&stmts, vectype, new_def);
	  reduc_inputs.quick_push (new_def);
	}
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    }
  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
	 (i.e. when reduc_fn is not available) and in the final adjustment
	 code (if needed).  Also get the original scalar reduction variable as
	 defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
	 represents a reduction pattern), the tree-code and scalar-def are
	 taken from the original stmt that the pattern-stmt (STMT) replaces.
	 Otherwise (it is a regular reduction) - the tree-code and scalar-def
	 are taken from STMT.  */

  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
  if (orig_stmt_info != stmt_info)
    {
      /* Reduction pattern  */
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
    }

  scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  scalar_results.create (group_size);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);

  /* True if we should implement SLP_REDUC using native reduction operations
     instead of scalar operations.  */
  direct_slp_reduc = (reduc_fn != IFN_LAST
		      && slp_reduc
		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
  /* In case of reduction chain, e.g.,
     # a1 = phi <a3, a0>
     a2 = operation (a1)
     a3 = operation (a2),

     we may end up with more than one vector result.  Here we reduce them
     to one vector.

     The same is true if we couldn't use a single defuse cycle.  */
  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      || direct_slp_reduc
      || ncopies > 1)
    {
      gimple_seq stmts = NULL;
      tree single_input = reduc_inputs[0];
      for (k = 1; k < reduc_inputs.length (); k++)
	single_input = gimple_build (&stmts, code, vectype,
				     single_input, reduc_inputs[k]);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      reduc_inputs.truncate (0);
      reduc_inputs.safe_push (single_input);
    }
  tree orig_reduc_input = reduc_inputs[0];

  /* If this loop is an epilogue loop that can be skipped after the
     main loop, we can only share a reduction operation between the
     main loop and the epilogue if we put it at the target of the
     skip edge.

     We can still reuse accumulators if this check fails.  Doing so has
     the minor(?) benefit of making the epilogue loop's scalar result
     independent of the main loop's scalar result.  */
  bool unify_with_main_loop_p = false;
  if (reduc_info->reused_accumulator
      && loop_vinfo->skip_this_loop_edge
      && single_succ_p (exit_bb)
      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
    {
      unify_with_main_loop_p = true;

      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
      reduc_inputs[0] = make_ssa_name (vectype);
      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
		   UNKNOWN_LOCATION);
      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
      exit_gsi = gsi_after_labels (reduc_block);
    }

  /* Shouldn't be used beyond this point.  */
  exit_bb = nullptr;
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
      && reduc_fn != IFN_LAST)
    {
      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
	 various data values where the condition matched and another vector
	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
	 need to extract the last matching index (which will be the index with
	 highest value) and use this to index into the data vector.
	 For the case where there were no matches, the data vector will contain
	 all default values and the index vector will be all zeros.  */
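      /* A scalar sketch of what the code below computes (illustrative only;
	 the generated code works on whole vectors, and the index vector is
	 all zeros when nothing matched):

	   unsigned max_idx = 0;
	   for (unsigned i = 0; i < nunits; ++i)
	     if (induction_index[i] > max_idx)
	       max_idx = induction_index[i];
	   for (unsigned i = 0; i < nunits; ++i)
	     if (induction_index[i] == max_idx)
	       scalar_result = data[i];

	 so when no lane matched, every lane compares equal and the (default)
	 data value is returned.  */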
      /* Get various versions of the type of the vector of indexes.  */
      tree index_vec_type = TREE_TYPE (induction_index);
      gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
      tree index_scalar_type = TREE_TYPE (index_vec_type);
      tree index_vec_cmp_type = truth_type_for (index_vec_type);

      /* Get an unsigned integer version of the type of the data vector.  */
      int scalar_precision
	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
      tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
      tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
						      vectype);

      /* First we need to create a vector (ZERO_VEC) of zeros and another
	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
	 can create using a MAX reduction and then expanding.
	 In the case where the loop never made any matches, the max index will
	 be zero.  */

      /* Vector of {0, 0, 0,...}.  */
      tree zero_vec = build_zero_cst (vectype);

      /* Find maximum value from the vector of found indexes.  */
      tree max_index = make_ssa_name (index_scalar_type);
      gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
							  1, induction_index);
      gimple_call_set_lhs (max_index_stmt, max_index);
      gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);

      /* Vector of {max_index, max_index, max_index,...}.  */
      tree max_index_vec = make_ssa_name (index_vec_type);
      tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
						      max_index);
      gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
							max_index_vec_rhs);
      gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);

      /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
	 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
	 otherwise.  Only one value should match, resulting in a vector
	 (VEC_COND) with one data value and the rest zeros.
	 In the case where the loop never made any matches, every index will
	 match, resulting in a vector with all data values (which will all be
	 the default value).  */

      /* Compare the max index vector to the vector of found indexes to find
	 the position of the max value.  */
      tree vec_compare = make_ssa_name (index_vec_cmp_type);
      gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
						      induction_index,
						      max_index_vec);
      gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);

      /* Use the compare to choose either values from the data vector or
	 zero.  */
      tree vec_cond = make_ssa_name (vectype);
      gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
						   vec_compare,
						   reduc_inputs[0],
						   zero_vec);
      gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);

      /* Finally we need to extract the data value from the vector (VEC_COND)
	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
	 reduction, but because this doesn't exist, we can use a MAX reduction
	 instead.  The data value might be signed or a float so we need to cast
	 it first.
	 In the case where the loop never made any matches, the data values are
	 all identical, and so will reduce down correctly.  */

      /* Make the matched data values unsigned.  */
      tree vec_cond_cast = make_ssa_name (vectype_unsigned);
      tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
				       vec_cond);
      gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
							vec_cond_cast_rhs);
      gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);

      /* Reduce down to a scalar value.  */
      tree data_reduc = make_ssa_name (scalar_type_unsigned);
      gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
							   1, vec_cond_cast);
      gimple_call_set_lhs (data_reduc_stmt, data_reduc);
      gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);

      /* Convert the reduced value back to the result type and set as the
	 result.  */
      gimple_seq stmts = NULL;
      new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
			       data_reduc);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (new_temp);
    }
  else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
	   && reduc_fn == IFN_LAST)
    {
      /* Condition reduction without supported IFN_REDUC_MAX.  Generate
	 idx_val = induction_index[0];
	 val = data_reduc[0];
	 for (idx = 0, val = init, i = 0; i < nelts; ++i)
	   if (induction_index[i] > idx_val)
	     val = data_reduc[i], idx_val = induction_index[i];
	 return val;  */
      tree data_eltype = TREE_TYPE (vectype);
      tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
      unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
      /* Enforced by vectorizable_reduction, which ensures we have target
	 support before allowing a conditional reduction on variable-length
	 vectors.  */
      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
      tree idx_val = NULL_TREE, val = NULL_TREE;
      for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
	{
	  tree old_idx_val = idx_val;
	  tree old_val = val;
	  idx_val = make_ssa_name (idx_eltype);
	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
					     build3 (BIT_FIELD_REF, idx_eltype,
						     induction_index,
						     bitsize_int (el_size),
						     bitsize_int (off)));
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  val = make_ssa_name (data_eltype);
	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
					     build3 (BIT_FIELD_REF,
						     data_eltype,
						     reduc_inputs[0],
						     bitsize_int (el_size),
						     bitsize_int (off)));
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  if (off != 0)
	    {
	      tree new_idx_val = idx_val;
	      if (off != v_size - el_size)
		{
		  new_idx_val = make_ssa_name (idx_eltype);
		  epilog_stmt = gimple_build_assign (new_idx_val,
						     MAX_EXPR, idx_val,
						     old_idx_val);
		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
		}
	      tree new_val = make_ssa_name (data_eltype);
	      epilog_stmt = gimple_build_assign (new_val,
						 COND_EXPR,
						 build2 (GT_EXPR,
							 boolean_type_node,
							 idx_val,
							 old_idx_val),
						 val, old_val);
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      idx_val = new_idx_val;
	      val = new_val;
	    }
	}
      /* Convert the reduced value back to the result type and set as the
	 result.  */
      gimple_seq stmts = NULL;
      val = gimple_convert (&stmts, scalar_type, val);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (val);
    }
  /* 2.3 Create the reduction code, using one of the three schemes described
	 above.  In SLP we simply need to extract all the elements from the
	 vector (without reducing them), so we use scalar shifts.  */
  else if (reduc_fn != IFN_LAST && !slp_reduc)
    {
      /* Case 1:  Create:
	 v_out2 = reduc_expr <v_out1>  */

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Reduce using direct vector reduction.\n");

      gimple_seq stmts = NULL;
      vec_elem_type = TREE_TYPE (vectype);
      new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
			       vec_elem_type, reduc_inputs[0]);
      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
	  && induc_val)
	{
	  /* Earlier we set the initial value to be a vector of induc_val
	     values.  Check the result and if it is induc_val then replace
	     with the original initial value, unless induc_val is
	     the same as initial_def already.  */
	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
				  induc_val);
	  tree initial_def = reduc_info->reduc_initial_values[0];

	  tmp = make_ssa_name (new_scalar_dest);
	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
					     initial_def, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  new_temp = tmp;
	}

      scalar_results.safe_push (new_temp);
    }
  else if (direct_slp_reduc)
    {
      /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
	 with the elements for other SLP statements replaced with the
	 neutral value.  We can then do a normal reduction on each vector.  */
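      /* For example (an illustrative sketch only, with made-up lane values):
	 with REDUC_GROUP_SIZE == 2 and a partial-result vector
	 { a0, b0, a1, b1 }, the code below builds
	   { a0, id, a1, id }  for SLP result 0 and
	   { id, b0, id, b1 }  for SLP result 1,
	 where "id" is the neutral value, and then reduces each of the two
	 vectors with reduc_fn.  */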
      /* Enforced by vectorizable_reduction.  */
      gcc_assert (reduc_inputs.length () == 1);
      gcc_assert (pow2p_hwi (group_size));

      gimple_seq seq = NULL;

      /* Build a vector {0, 1, 2, ...}, with the same number of elements
	 and the same element size as VECTYPE.  */
      tree index = build_index_vector (vectype, 0, 1);
      tree index_type = TREE_TYPE (index);
      tree index_elt_type = TREE_TYPE (index_type);
      tree mask_type = truth_type_for (index_type);

      /* Create a vector that, for each element, identifies which of
	 the REDUC_GROUP_SIZE results should use it.  */
      tree index_mask = build_int_cst (index_elt_type, group_size - 1);
      index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
			    build_vector_from_val (index_type, index_mask));

      /* Get a neutral vector value.  This is simply a splat of the neutral
	 scalar value if we have one, otherwise the initial scalar value
	 is itself a neutral value.  */
      tree vector_identity = NULL_TREE;
      tree neutral_op = NULL_TREE;
      if (slp_node)
	{
	  tree initial_value = NULL_TREE;
	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
	    initial_value = reduc_info->reduc_initial_values[0];
	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
						 initial_value);
	}
      if (neutral_op)
	vector_identity = gimple_build_vector_from_val (&seq, vectype,
							neutral_op);
      for (unsigned int i = 0; i < group_size; ++i)
	{
	  /* If there's no universal neutral value, we can use the
	     initial scalar value from the original PHI.  This is used
	     for MIN and MAX reduction, for example.  */
	  if (!neutral_op)
	    {
	      tree scalar_value = reduc_info->reduc_initial_values[i];
	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
					     scalar_value);
	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
							      scalar_value);
	    }

	  /* Calculate the equivalent of:

	     sel[j] = (index[j] == i);

	     which selects the elements of REDUC_INPUTS[0] that should
	     be included in the result.  */
	  tree compare_val = build_int_cst (index_elt_type, i);
	  compare_val = build_vector_from_val (index_type, compare_val);
	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
				   index, compare_val);

	  /* Calculate the equivalent of:

	     vec = seq ? reduc_inputs[0] : vector_identity;

	     VEC is now suitable for a full vector reduction.  */
	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
				   sel, reduc_inputs[0], vector_identity);

	  /* Do the reduction and convert it to the appropriate type.  */
	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
				      TREE_TYPE (vectype), vec);
	  scalar = gimple_convert (&seq, scalar_type, scalar);
	  scalar_results.safe_push (scalar);
	}
      gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      bool reduce_with_shift;
      tree vec_temp;

      gcc_assert (slp_reduc || reduc_inputs.length () == 1);

      /* See if the target wants to do the final (shift) reduction
	 in a vector mode of smaller size and first reduce upper/lower
	 halves against each other.  */
      enum machine_mode mode1 = mode;
      tree stype = TREE_TYPE (vectype);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned nunits1 = nunits;
      if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
	  && reduc_inputs.length () == 1)
	{
	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
	  /* For SLP reductions we have to make sure lanes match up, but
	     since we're doing individual element final reduction reducing
	     vector width here is even more important.
	     ??? We can also separate lanes with permutes, for the common
	     case of power-of-two group-size odd/even extracts would work.  */
	  if (slp_reduc && nunits != nunits1)
	    {
	      nunits1 = least_common_multiple (nunits1, group_size);
	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
	    }
	}
      else if (!slp_reduc
	       && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();

      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
							   stype, nunits1);
      reduce_with_shift = have_whole_vector_shift (mode1);
      if (!VECTOR_MODE_P (mode1)
	  || !directly_supported_p (code, vectype1))
	reduce_with_shift = false;

      /* First reduce the vector to the desired vector size we should
	 do shift reduction on by combining upper and lower halves.  */
      gimple_seq stmts = NULL;
      new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
					     code, &stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      reduc_inputs[0] = new_temp;
      if (reduce_with_shift && !slp_reduc)
	{
	  int element_bitsize = tree_to_uhwi (bitsize);
	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
	     for variable-length vectors and also requires direct target support
	     for loop reductions.  */
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
	  int nelements = vec_size_in_bits / element_bitsize;
	  vec_perm_builder sel;
	  vec_perm_indices indices;

	  int elt_offset;
	  tree rhs;

	  tree zero_vec = build_zero_cst (vectype1);
	  /* Case 2: Create:
	     for (offset = nelements/2; offset >= 1; offset/=2)
		{
		  Create:  va' = vec_shift <va, offset>
		  Create:  va = vop <va, va'>
		}  */

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Reduce using vector shifts\n");

	  gimple_seq stmts = NULL;
	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
	  for (elt_offset = nelements / 2;
	       elt_offset >= 1;
	       elt_offset /= 2)
	    {
	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
	      indices.new_vector (sel, 2, nelements);
	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
				       new_temp, zero_vec, mask);
	      new_temp = gimple_build (&stmts, code,
				       vectype1, new_name, new_temp);
	    }
	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

	  /* 2.4  Extract the final scalar result.  Create:
	     s_out3 = extract_field <v_out2, bitpos>  */

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "extract scalar result\n");

	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
			bitsize, bitsize_zero_node);
	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
	  gimple_assign_set_lhs (epilog_stmt, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  scalar_results.safe_push (new_temp);
	}
      else
	{
	  /* Case 3: Create:
	     s = extract_field <v_out2, 0>
	     for (offset = element_size;
		  offset < vector_size;
		  offset += element_size;)
	       {
		 Create:  s' = extract_field <v_out2, offset>
		 Create:  s = op <s, s'>  // For non SLP cases
	       }  */
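	  /* Illustrative sketch only: for a hypothetical 4-element vector
	     { a0, a1, a2, a3 } and a PLUS reduction this expands to
	       s = a0;  s = s + a1;  s = s + a2;  s = s + a3;
	     whereas in the SLP case the extracted elements are collected in
	     SCALAR_RESULTS without being combined.  */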
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Reduce using scalar code.\n");

	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
	  int element_bitsize = tree_to_uhwi (bitsize);
	  tree compute_type = TREE_TYPE (vectype);
	  gimple_seq stmts = NULL;
	  FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
	    {
	      int bit_offset;
	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
				       vec_temp, bitsize, bitsize_zero_node);

	      /* In SLP we don't need to apply reduction operation, so we just
		 collect s' values in SCALAR_RESULTS.  */
	      if (slp_reduc)
		scalar_results.safe_push (new_temp);

	      for (bit_offset = element_bitsize;
		   bit_offset < vec_size_in_bits;
		   bit_offset += element_bitsize)
		{
		  tree bitpos = bitsize_int (bit_offset);
		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
					   compute_type, vec_temp,
					   bitsize, bitpos);

		  if (slp_reduc)
		    {
		      /* In SLP we don't need to apply reduction operation, so
			 we just collect s' values in SCALAR_RESULTS.  */
		      new_temp = new_name;
		      scalar_results.safe_push (new_name);
		    }
		  else
		    new_temp = gimple_build (&stmts, code, compute_type,
					     new_name, new_temp);
		}
	    }

	  /* The only case where we need to reduce scalar results in SLP, is
	     unrolling.  If the size of SCALAR_RESULTS is greater than
	     REDUC_GROUP_SIZE, we reduce them combining elements modulo
	     REDUC_GROUP_SIZE.  */
	  if (slp_reduc)
	    {
	      tree res, first_res, new_res;

	      /* Reduce multiple scalar results in case of SLP unrolling.  */
	      for (j = group_size; scalar_results.iterate (j, &res);
		   j++)
		{
		  first_res = scalar_results[j % group_size];
		  new_res = gimple_build (&stmts, code, compute_type,
					  first_res, res);
		  scalar_results[j % group_size] = new_res;
		}
	      scalar_results.truncate (group_size);
	      for (k = 0; k < group_size; k++)
		scalar_results[k] = gimple_convert (&stmts, scalar_type,
						    scalar_results[k]);
	    }
	  else
	    {
	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
	      scalar_results.safe_push (new_temp);
	    }

	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
	}
      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
	  && induc_val)
	{
	  /* Earlier we set the initial value to be a vector of induc_val
	     values.  Check the result and if it is induc_val then replace
	     with the original initial value, unless induc_val is
	     the same as initial_def already.  */
	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
				  induc_val);
	  tree initial_def = reduc_info->reduc_initial_values[0];

	  tree tmp = make_ssa_name (new_scalar_dest);
	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
					     initial_def, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  scalar_results[0] = tmp;
	}
    }
  /* 2.5 Adjust the final result by the initial value of the reduction
	 variable. (When such adjustment is not needed, then
	 'adjustment_def' is zero).  For example, if code is PLUS we create:
	 new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      gcc_assert (!slp_reduc);
      gimple_seq stmts = NULL;
      if (double_reduc)
	{
	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
	  new_temp = gimple_build (&stmts, code, vectype,
				   reduc_inputs[0], adjustment_def);
	}
      else
	{
	  new_temp = scalar_results[0];
	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
	  adjustment_def = gimple_convert (&stmts, scalar_type,
					   adjustment_def);
	  new_temp = gimple_build (&stmts, code, scalar_type,
				   new_temp, adjustment_def);
	}

      epilog_stmt = gimple_seq_last_stmt (stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results[0] = new_temp;
    }

  /* Record this operation if it could be reused by the epilogue loop.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
    loop_vinfo->reusable_accumulators.put (scalar_results[0],
					   { orig_reduc_input, reduc_info });
  /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
	  phis with new adjusted scalar results, i.e., replace use <s_out0>
	  with use <s_out4>.

     Transform:
	loop_exit:
	  s_out0 = phi <s_loop>			# (scalar) EXIT_PHI
	  v_out1 = phi <VECT_DEF>		# NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>
	  s_out3 = extract_field <v_out2, 0>
	  s_out4 = adjust_result <s_out3>
	  use <s_out0>
	  use <s_out0>

     into:

	loop_exit:
	  s_out0 = phi <s_loop>			# (scalar) EXIT_PHI
	  v_out1 = phi <VECT_DEF>		# NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>
	  s_out3 = extract_field <v_out2, 0>
	  s_out4 = adjust_result <s_out3>
	  use <s_out4>
	  use <s_out4>  */

  gcc_assert (live_out_stmts.size () == scalar_results.length ());
  for (k = 0; k < live_out_stmts.size (); k++)
    {
      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
      scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);

      phis.create (3);
      /* Find the loop-closed-use at the loop exit of the original scalar
	 result.  (The reduction result is expected to have two immediate uses,
	 one at the latch block, and one at the loop exit).  For double
	 reductions we are looking for exit phis of the outer loop.  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
	{
	  if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
	    {
	      if (!is_gimple_debug (USE_STMT (use_p)))
		phis.safe_push (USE_STMT (use_p));
	    }
	  else
	    {
	      if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
		{
		  tree phi_res = PHI_RESULT (USE_STMT (use_p));

		  FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
		    {
		      if (!flow_bb_inside_loop_p (loop,
						  gimple_bb (USE_STMT (phi_use_p)))
			  && !is_gimple_debug (USE_STMT (phi_use_p)))
			phis.safe_push (USE_STMT (phi_use_p));
		    }
		}
	    }
	}

      FOR_EACH_VEC_ELT (phis, i, exit_phi)
	{
	  /* Replace the uses:  */
	  orig_name = PHI_RESULT (exit_phi);

	  /* Look for a single use at the target of the skip edge.  */
	  if (unify_with_main_loop_p)
	    {
	      use_operand_p use_p;
	      gimple *user;
	      if (!single_imm_use (orig_name, &use_p, &user))
		gcc_unreachable ();
	      orig_name = gimple_get_lhs (user);
	    }

	  scalar_result = scalar_results[k];
	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		SET_USE (use_p, scalar_result);
	      update_stmt (use_stmt);
	    }
	}

      phis.release ();
    }
}
/* Return a vector of type VECTYPE that is equal to the vector select
   operation "MASK ? VEC : IDENTITY".  Insert the select statements
   before GSI.  */

static tree
merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
		     tree vec, tree identity)
{
  tree cond = make_temp_ssa_name (vectype, NULL, "cond");
  gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
					  mask, vec, identity);
  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
  return cond;
}
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   Return the SSA name for the result.  */
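/* For example (illustrative only): for a 4-element VECTOR_RHS { v0, v1, v2,
   v3 }, CODE == PLUS_EXPR and an incoming LHS "acc" this emits

     acc = acc + v0;
     acc = acc + v1;
     acc = acc + v2;
     acc = acc + v3;

   keeping the strict left-to-right order that in-order (fold-left)
   reductions require.  */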
static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
		       tree_code code, tree lhs, tree vector_rhs)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
			 bitsize, bitpos);

      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);

      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }
  return lhs;
}
/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
   type of the vector input.  */

static internal_fn
get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
{
  internal_fn mask_reduc_fn;

  switch (reduc_fn)
    {
    case IFN_FOLD_LEFT_PLUS:
      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
      break;

    default:
      return IFN_LAST;
    }

  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
				      OPTIMIZE_FOR_SPEED))
    return mask_reduc_fn;
  return IFN_LAST;
}
/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
   statement that sets the live-out value.  REDUC_DEF_STMT is the phi
   statement.  CODE is the operation performed by STMT_INFO and OPS are
   its scalar operands.  REDUC_INDEX is the index of the operand in
   OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
   implements in-order reduction, or IFN_LAST if we should open-code it.
   VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
   that should be used to control the operation in a fully-masked loop.  */
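/* A minimal scalar sketch of the semantics being preserved (illustrative
   only): an in-order reduction of a float array must be evaluated as

     float res = init;
     for (int i = 0; i < n; ++i)
       res = res + a[i];

   i.e. strictly left to right, because floating-point addition is not
   associative; the partial-sum reassociation used by the other reduction
   schemes would not be valid here.  */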
static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info,
			       gimple_stmt_iterator *gsi,
			       gimple **vec_stmt, slp_tree slp_node,
			       gimple *reduc_def_stmt,
			       tree_code code, internal_fn reduc_fn,
			       tree ops[3], tree vectype_in,
			       int reduc_index, vec_loop_masks *masks)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);

  int ncopies;
  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
  gcc_assert (ncopies == 1);
  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);

  if (slp_node)
    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
			  TYPE_VECTOR_SUBPARTS (vectype_in)));

  tree op0 = ops[1 - reduc_index];

  int group_size = 1;
  stmt_vec_info scalar_dest_def_info;
  auto_vec<tree> vec_oprnds0;
  if (slp_node)
    {
      auto_vec<vec<tree> > vec_defs (2);
      vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
      vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
      vec_defs[0].release ();
      vec_defs[1].release ();
      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
    }
  else
    {
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     op0, &vec_oprnds0);
      scalar_dest_def_info = stmt_info;
    }

  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
  tree scalar_type = TREE_TYPE (scalar_dest);
  tree reduc_var = gimple_phi_result (reduc_def_stmt);

  int vec_num = vec_oprnds0.length ();
  gcc_assert (vec_num == 1 || slp_node);
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    vector_identity = build_zero_cst (vectype_out);

  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  int i;
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree mask = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (vectype_out);
	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
	  def0 = negated;
	}

      if (mask && mask_reduc_fn == IFN_LAST)
	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
				    vector_identity);

      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
	  if (mask && mask_reduc_fn != IFN_LAST)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
						   def0);
	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next. For the final statement
	     the output cannot be a temporary - we reuse the original
	     scalar destination of the last statement.  */
	  if (i != vec_num - 1)
	    {
	      gimple_set_lhs (new_stmt, scalar_dest_var);
	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
	      gimple_set_lhs (new_stmt, reduc_var);
	    }
	}
      else
	{
	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
					     reduc_var, def0);
	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
	  /* Remove the statement, so that we can use the same code paths
	     as for statements that we've just created.  */
	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
	  gsi_remove (&tmp_gsi, true);
	}

      if (i == vec_num - 1)
	{
	  gimple_set_lhs (new_stmt, scalar_dest);
	  vect_finish_replace_stmt (loop_vinfo,
				    scalar_dest_def_info,
				    new_stmt);
	}
      else
	vect_finish_stmt_generation (loop_vinfo,
				     scalar_dest_def_info,
				     new_stmt, gsi);

      if (slp_node)
	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
      else
	{
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	  *vec_stmt = new_stmt;
	}
    }

  return true;
}
/* Function is_nonwrapping_integer_induction.

   Check if STMT_VINFO (which is part of loop LOOP) both increments and
   does not cause overflow.  */
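/* For example (a sketch with made-up numbers): for an induction with base 0
   and step 4 in a loop that executes at most 100 times, the largest value the
   IV can reach is 0 + 4 * 100 = 400, so the precision check below succeeds
   for a 16-bit IV type but fails for an 8-bit one.  */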
static bool
is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
{
  gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
  tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
  tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
  widest_int ni, max_loop_value, lhs_max;
  wi::overflow_type overflow = wi::OVF_NONE;

  /* Make sure the loop is integer based.  */
  if (TREE_CODE (base) != INTEGER_CST
      || TREE_CODE (step) != INTEGER_CST)
    return false;

  /* Check that the max size of the loop will not wrap.  */

  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
    return true;

  if (! max_stmt_executions (loop, &ni))
    return false;

  max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
			    &overflow);
  if (overflow)
    return false;

  max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
			    TYPE_SIGN (lhs_type), &overflow);
  if (overflow)
    return false;

  return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
	  <= TYPE_PRECISION (lhs_type));
}
/* Check if masking can be supported by inserting a conditional expression.
   CODE is the code for the operation.  COND_FN is the conditional internal
   function, if it exists.  VECTYPE_IN is the type of the vector input.  */

static bool
use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
			 tree vectype_in)
{
  if (cond_fn != IFN_LAST
      && direct_internal_fn_supported_p (cond_fn, vectype_in,
					 OPTIMIZE_FOR_SPEED))
    return false;

  if (code.is_tree_code ())
    switch (tree_code (code))
      {
      case DOT_PROD_EXPR:
      case SAD_EXPR:
	return true;

      default:
	break;
      }
  return false;
}
/* Insert a conditional expression to enable masked vectorization.  CODE is the
   code for the operation.  VOP is the array of operands.  MASK is the loop
   mask.  GSI is a statement iterator used to place the new conditional
   expression.  */
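/* As an illustrative scalar sketch (not the emitted GIMPLE): masking one
   multiplicand of a dot-product this way corresponds to

     acc += a[i] * (mask ? b[i] : 0);

   so inactive lanes contribute the operation's neutral value instead of
   whatever the inactive vector elements happen to contain.  */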
static void
build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
		      gimple_stmt_iterator *gsi)
{
  switch (tree_code (code))
    {
    case DOT_PROD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree zero = build_zero_cst (vectype);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], zero);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    case SAD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], vop[0]);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vectorizable_reduction.

   Check if STMT_INFO performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return true if STMT_INFO is vectorizable in this way.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
   may be of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt
   (STMT_INFO).

   This function also handles reduction of condition expressions, for example:
     for (int i = 0; i < N; i++)
       if (a[i] < value)
	 last = i;
   This is handled by vectorising the loop and creating an additional vector
   containing the loop indexes for which "a[i] < value" was true.  In the
   function epilogue this is reduced to a single max value and then used to
   index into the vector of results.

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT_INFO.
   In such cases, the vectype that is used when transforming STMT_INFO into
   a vector stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
     get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
     STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
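/* For example (illustrative source only), the accumulation-of-shorts case
   described above comes from code like:

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   which vect_pattern_recog may rewrite into a widen_sum pattern stmt on
   targets that support it.  */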
bool
vectorizable_reduction (loop_vec_info loop_vinfo,
			stmt_vec_info stmt_info, slp_tree slp_node,
			slp_instance slp_node_instance,
			stmt_vector_for_cost *cost_vec)
{
  tree vectype_in = NULL_TREE;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
  stmt_vec_info cond_stmt_vinfo = NULL;
  bool single_defuse_cycle = false;
  bool nested_cycle = false;
  bool double_reduc = false;
  tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
  tree cond_reduc_val = NULL_TREE;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
    return false;
6581 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6582 reduc_info
->is_reduc_info
= true;
6584 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
6586 if (is_a
<gphi
*> (stmt_info
->stmt
))
6590 /* We eventually need to set a vector type on invariant
6594 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
6595 if (!vect_maybe_update_slp_op_vectype
6596 (child
, SLP_TREE_VECTYPE (slp_node
)))
6598 if (dump_enabled_p ())
6599 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6600 "incompatible vector types for "
6605 /* Analysis for double-reduction is done on the outer
6606 loop PHI, nested cycles have no further restrictions. */
6607 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
6610 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6614 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
6615 stmt_vec_info phi_info
= stmt_info
;
6616 if (!is_a
<gphi
*> (stmt_info
->stmt
))
6618 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6623 slp_node_instance
->reduc_phis
= slp_node
;
6624 /* ??? We're leaving slp_node to point to the PHIs, we only
6625 need it to get at the number of vector stmts which wasn't
6626 yet initialized for the instance root. */
6628 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
6629 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info
));
6632 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
)
6633 == vect_double_reduction_def
);
6634 use_operand_p use_p
;
6636 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
6639 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
6640 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
6643 /* PHIs should not participate in patterns. */
6644 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6645 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
  /* Verify following REDUC_IDX from the latch def leads us back to the PHI
     and compute the reduction chain length.  Discover the real
     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6651 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6653 (gimple_bb (reduc_def_phi
)->loop_father
));
6654 unsigned reduc_chain_length
= 0;
6655 bool only_slp_reduc_chain
= true;
6657 slp_tree slp_for_stmt_info
= slp_node
? slp_node_instance
->root
: NULL
;
6658 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
6660 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
6661 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
6662 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
6664 if (dump_enabled_p ())
6665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6666 "reduction chain broken by patterns.\n");
6669 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
6670 only_slp_reduc_chain
= false;
6671 /* ??? For epilogue generation live members of the chain need
6672 to point back to the PHI via their original stmt for
6673 info_for_reduction to work. */
6674 if (STMT_VINFO_LIVE_P (vdef
))
6675 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
6677 if (!gimple_extract_op (vdef
->stmt
, &op
))
6679 if (dump_enabled_p ())
6680 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6681 "reduction chain includes unsupported"
6682 " statement type.\n");
6685 if (CONVERT_EXPR_CODE_P (op
.code
))
6687 if (!tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
6689 if (dump_enabled_p ())
6690 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6691 "conversion in the reduction chain.\n");
6695 else if (!stmt_info
)
6696 /* First non-conversion stmt. */
6698 reduc_def
= op
.ops
[STMT_VINFO_REDUC_IDX (vdef
)];
6699 reduc_chain_length
++;
6700 if (!stmt_info
&& slp_node
)
6701 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
6703 /* PHIs should not participate in patterns. */
6704 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6706 if (nested_in_vect_loop_p (loop
, stmt_info
))
6709 nested_cycle
= true;
6712 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6714 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6716 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
6717 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
6719 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6720 gcc_assert (slp_node
6721 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
  /* 1. Is vectorizable reduction?  */
  /* Not supportable if the reduction variable is used in the loop, unless
     it's a reduction chain.  */
6726 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6727 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
  /* Reductions that are not used even in an enclosing outer-loop,
     are expected to be "live" (used out of the loop).  */
6732 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6733 && !STMT_VINFO_LIVE_P (stmt_info
))
  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */
6743 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6746 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6747 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
  /* 3. Check the operands of the operation.  The first operands are defined
	inside the loop body.  The last operand is the reduction variable,
	which is defined by the loop-header-phi.  */
6754 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6755 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
6757 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
6759 bool lane_reduc_code_p
= (op
.code
== DOT_PROD_EXPR
6760 || op
.code
== WIDEN_SUM_EXPR
6761 || op
.code
== SAD_EXPR
);
6762 enum optab_subtype optab_query_kind
= optab_vector
;
6763 if (op
.code
== DOT_PROD_EXPR
6764 && (TYPE_SIGN (TREE_TYPE (op
.ops
[0]))
6765 != TYPE_SIGN (TREE_TYPE (op
.ops
[1]))))
6766 optab_query_kind
= optab_vector_mixed_sign
;
6768 if (!POINTER_TYPE_P (op
.type
) && !INTEGRAL_TYPE_P (op
.type
)
6769 && !SCALAR_FLOAT_TYPE_P (op
.type
))
6772 /* Do not try to vectorize bit-precision reductions. */
6773 if (!type_has_mode_precision_p (op
.type
))
6776 /* For lane-reducing ops we're reducing the number of reduction PHIs
6777 which means the only use of that may be in the lane-reducing operation. */
6778 if (lane_reduc_code_p
6779 && reduc_chain_length
!= 1
6780 && !only_slp_reduc_chain
)
6782 if (dump_enabled_p ())
6783 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6784 "lane-reducing reduction with extra stmts.\n");
  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
6792 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op
.num_ops
);
6793 /* We need to skip an extra operand for COND_EXPRs with embedded
6795 unsigned opno_adjust
= 0;
6796 if (op
.code
== COND_EXPR
&& COMPARISON_CLASS_P (op
.ops
[0]))
6798 for (i
= 0; i
< (int) op
.num_ops
; i
++)
6800 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6801 if (i
== 0 && op
.code
== COND_EXPR
)
6804 stmt_vec_info def_stmt_info
;
6805 enum vect_def_type dt
;
6806 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
6807 i
+ opno_adjust
, &op
.ops
[i
], &slp_op
[i
], &dt
,
6808 &tem
, &def_stmt_info
))
6810 if (dump_enabled_p ())
6811 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6812 "use not simple.\n");
6815 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
6818 /* There should be only one cycle def in the stmt, the one
6819 leading to reduc_def. */
6820 if (VECTORIZABLE_CYCLE_DEF (dt
))
6823 /* To properly compute ncopies we are interested in the widest
6824 non-reduction input type in case we're looking at a widening
6825 accumulation that we later handle in vect_transform_reduction. */
6826 if (lane_reduc_code_p
6829 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6830 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
))))))
6833 if (op
.code
== COND_EXPR
)
6835 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6836 if (dt
== vect_constant_def
)
6839 cond_reduc_val
= op
.ops
[i
];
6841 if (dt
== vect_induction_def
6843 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
6846 cond_stmt_vinfo
= def_stmt_info
;
6851 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
6852 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
6854 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
6855 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
6856 /* If we have a condition reduction, see if we can simplify it further. */
6857 if (v_reduc_type
== COND_REDUCTION
)
6862 /* When the condition uses the reduction value in the condition, fail. */
6863 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6867 "condition depends on previous iteration\n");
6871 if (reduc_chain_length
== 1
6872 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
6873 vectype_in
, OPTIMIZE_FOR_SPEED
))
6875 if (dump_enabled_p ())
6876 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6877 "optimizing condition reduction with"
6878 " FOLD_EXTRACT_LAST.\n");
6879 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
6881 else if (cond_reduc_dt
== vect_induction_def
)
6884 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
6885 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
6887 gcc_assert (TREE_CODE (base
) == INTEGER_CST
6888 && TREE_CODE (step
) == INTEGER_CST
);
6889 cond_reduc_val
= NULL_TREE
;
6890 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
6891 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
6892 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
6894 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6895 above base; punt if base is the minimum value of the type for
6896 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6897 else if (tree_int_cst_sgn (step
) == -1)
6899 cond_reduc_op_code
= MIN_EXPR
;
6900 if (tree_int_cst_sgn (base
) == -1)
6901 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6902 else if (tree_int_cst_lt (base
,
6903 TYPE_MAX_VALUE (TREE_TYPE (base
))))
6905 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
6909 cond_reduc_op_code
= MAX_EXPR
;
6910 if (tree_int_cst_sgn (base
) == 1)
6911 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6912 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
6915 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
6919 if (dump_enabled_p ())
6920 dump_printf_loc (MSG_NOTE
, vect_location
,
6921 "condition expression based on "
6922 "integer induction.\n");
6923 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
6924 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
6926 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
6929 else if (cond_reduc_dt
== vect_constant_def
)
6931 enum vect_def_type cond_initial_dt
;
6932 tree cond_initial_val
= vect_phi_initial_value (reduc_def_phi
);
6933 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
6934 if (cond_initial_dt
== vect_constant_def
6935 && types_compatible_p (TREE_TYPE (cond_initial_val
),
6936 TREE_TYPE (cond_reduc_val
)))
6938 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
6939 cond_initial_val
, cond_reduc_val
);
6940 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_NOTE
, vect_location
,
6944 "condition expression based on "
6945 "compile time constant.\n");
6946 /* Record reduction code at analysis stage. */
6947 STMT_VINFO_REDUC_CODE (reduc_info
)
6948 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6949 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
6955 if (STMT_VINFO_LIVE_P (phi_info
))
6961 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6963 gcc_assert (ncopies
>= 1);
6965 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6969 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
6970 == vect_double_reduction_def
);
6971 double_reduc
= true;
  /* 4.2. Check support for the epilog operation.

	  If STMT represents a reduction pattern, then the type of the
	  reduction variable may be different than the type of the rest
	  of the arguments.  For example, consider the case of accumulation
	  of shorts into an int accumulator; The original code:
			S1: int_a = (int) short_a;
	  orig_stmt->	S2: int_acc = plus <int_a ,int_acc>;

	  was replaced with:
			STMT: int_acc = widen_sum <short_a, int_acc>

	  This means that:
	  1. The tree-code that is used to create the vector operation in the
	     epilog code (that reduces the partial results) is not the
	     tree-code of STMT, but is rather the tree-code of the original
	     stmt from the pattern that STMT is replacing.  I.e, in the example
	     above we want to use 'widen_sum' in the loop, but 'plus' in the
	     epilog.
	  2. The type (mode) we use to check available target support
	     for the vector operation to be created in the *epilog*, is
	     determined by the type of the reduction variable (in the example
	     above we'd check this: optab_handler (plus_optab, vect_int_mode])).
	     However the type (mode) we use to check available target support
	     for the vector operation to be created *inside the loop*, is
	     determined by the type of the other arguments to STMT (in the
	     example we'd check this: optab_handler (widen_sum_optab,
	     vect_short_mode)).

	  This is contrary to "regular" reductions, in which the types of all
	  the arguments are the same as the type of the reduction variable.
	  For "regular" reductions we can therefore use the same vector type
	  (and also the same tree-code) when generating the epilog code and
	  when generating the code inside the loop.  */
7009 code_helper orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
7010 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
7012 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7013 if (reduction_type
== TREE_CODE_REDUCTION
)
      /* Check whether it's ok to change the order of the computation.
	 Generally, when vectorizing a reduction we change the order of the
	 computation.  This may change the behavior of the program in some
	 cases, so we need to check that this is ok.  One exception is when
	 vectorizing an outer-loop: the inner-loop is executed sequentially,
	 and therefore vectorizing reductions in the inner-loop during
	 outer-loop vectorization is safe.  Likewise when we are vectorizing
	 a series of reductions using SLP and the VF is one the reductions
	 are performed in scalar order.  */
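      /* Illustrative example of why this check matters: with floats,
	 (x + y) + z need not equal x + (y + z); e.g. with x = 1e20f,
	 y = -1e20f and z = 1.0f the first form yields 1.0f and the second
	 0.0f, so reassociating a float sum into vector partial sums can
	 change the result.  */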
7025 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7026 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1u))
7028 else if (needs_fold_left_reduction_p (op
.type
, orig_code
))
7030 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7031 is not directy used in stmt. */
7032 if (!only_slp_reduc_chain
7033 && reduc_chain_length
!= 1)
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7037 "in-order reduction chain without SLP.\n");
7040 STMT_VINFO_REDUC_TYPE (reduc_info
)
7041 = reduction_type
= FOLD_LEFT_REDUCTION
;
7043 else if (!commutative_binary_op_p (orig_code
, op
.type
)
7044 || !associative_binary_op_p (orig_code
, op
.type
))
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7048 "reduction: not commutative/associative");
7053 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7058 "multiple types in double reduction or condition "
7059 "reduction or fold-left reduction.\n");
7063 internal_fn reduc_fn
= IFN_LAST
;
7064 if (reduction_type
== TREE_CODE_REDUCTION
7065 || reduction_type
== FOLD_LEFT_REDUCTION
7066 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
7067 || reduction_type
== CONST_COND_REDUCTION
)
7069 if (reduction_type
== FOLD_LEFT_REDUCTION
7070 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
7071 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
7073 if (reduc_fn
!= IFN_LAST
7074 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
7075 OPTIMIZE_FOR_SPEED
))
7077 if (dump_enabled_p ())
7078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7079 "reduc op not supported by target.\n");
7081 reduc_fn
= IFN_LAST
;
7086 if (!nested_cycle
|| double_reduc
)
7088 if (dump_enabled_p ())
7089 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7090 "no reduc code for scalar code.\n");
7096 else if (reduction_type
== COND_REDUCTION
)
7098 int scalar_precision
7099 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op
.type
));
7100 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
7101 cr_index_vector_type
= get_same_sized_vectype (cr_index_scalar_type
,
7104 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
7105 OPTIMIZE_FOR_SPEED
))
7106 reduc_fn
= IFN_REDUC_MAX
;
7108 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
7110 if (reduction_type
!= EXTRACT_LAST_REDUCTION
7111 && (!nested_cycle
|| double_reduc
)
7112 && reduc_fn
== IFN_LAST
7113 && !nunits_out
.is_constant ())
7115 if (dump_enabled_p ())
7116 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7117 "missing target support for reduction on"
7118 " variable-length vectors.\n");
7122 /* For SLP reductions, see if there is a neutral value we can use. */
7123 tree neutral_op
= NULL_TREE
;
7126 tree initial_value
= NULL_TREE
;
7127 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
)
7128 initial_value
= vect_phi_initial_value (reduc_def_phi
);
7129 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype_out
),
7130 orig_code
, initial_value
);
7133 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
      /* We can't support in-order reductions of code such as this:

	   for (int i = 0; i < n1; ++i)
	     for (int j = 0; j < n2; ++j)
	       l += a[j];

	 since GCC effectively transforms the loop when vectorizing:

	   for (int i = 0; i < n1 / VF; ++i)
	     for (int j = 0; j < n2; ++j)
	       for (int k = 0; k < VF; ++k)
		 l += a[j];

	 which is a reassociation of the original operation.  */
7149 if (dump_enabled_p ())
7150 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7151 "in-order double reduction not supported.\n");
  if (reduction_type == FOLD_LEFT_REDUCTION
      && slp_node
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* We cannot use in-order reductions in this case because there is
         an implicit reassociation of the operations involved.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "in-order unchained SLP reductions not supported.\n");
      return false;
    }
  /* For double reductions, and for SLP reductions with a neutral value,
     we construct a variable-length initial vector by loading a vector
     full of the neutral value and then shift-and-inserting the start
     values into the low-numbered elements.  */
  if ((double_reduc || neutral_op)
      && !nunits_out.is_constant ()
      && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
                                          vectype_out, OPTIMIZE_FOR_SPEED))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "reduction on variable-length vectors requires"
                         " target support for a vector-shift-and-insert"
                         " operation.\n");
      return false;
    }
7185 /* Check extra constraints for variable-length unchained SLP reductions. */
7186 if (STMT_SLP_TYPE (stmt_info
)
7187 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7188 && !nunits_out
.is_constant ())
7190 /* We checked above that we could build the initial vector when
7191 there's a neutral element value. Check here for the case in
7192 which each SLP statement has its own initial value and in which
7193 that value needs to be repeated for every instance of the
7194 statement within the initial vector. */
7195 unsigned int group_size
= SLP_TREE_LANES (slp_node
);
7197 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
7198 TREE_TYPE (vectype_out
)))
7200 if (dump_enabled_p ())
7201 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7202 "unsupported form of SLP reduction for"
7203 " variable-length vectors: cannot build"
7204 " initial vector.\n");
7207 /* The epilogue code relies on the number of elements being a multiple
7208 of the group size. The duplicate-and-interleave approach to setting
7209 up the initial vector does too. */
7210 if (!multiple_p (nunits_out
, group_size
))
7212 if (dump_enabled_p ())
7213 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7214 "unsupported form of SLP reduction for"
7215 " variable-length vectors: the vector size"
7216 " is not a multiple of the number of results.\n");
7221 if (reduction_type
== COND_REDUCTION
)
7225 if (! max_loop_iterations (loop
, &ni
))
7227 if (dump_enabled_p ())
7228 dump_printf_loc (MSG_NOTE
, vect_location
,
7229 "loop count not known, cannot create cond "
7233 /* Convert backedges to iterations. */
7236 /* The additional index will be the same type as the condition. Check
7237 that the loop can fit into this less one (because we'll use up the
7238 zero slot for when there are no matches). */
7239 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
7240 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
7242 if (dump_enabled_p ())
7243 dump_printf_loc (MSG_NOTE
, vect_location
,
7244 "loop size is greater than data size.\n");
7249 /* In case the vectorization factor (VF) is bigger than the number
7250 of elements that we can fit in a vectype (nunits), we have to generate
7251 more than one vector stmt - i.e - we need to "unroll" the
7252 vector stmt by a factor VF/nunits. For more details see documentation
7253 in vectorizable_operation. */
7255 /* If the reduction is used in an outer loop we need to generate
7256 VF intermediate results, like so (e.g. for ncopies=2):
7261 (i.e. we generate VF results in 2 registers).
7262 In this case we have a separate def-use cycle for each copy, and therefore
7263 for each copy we get the vector def for the reduction variable from the
7264 respective phi node created for this copy.
7266 Otherwise (the reduction is unused in the loop nest), we can combine
7267 together intermediate results, like so (e.g. for ncopies=2):
7271 (i.e. we generate VF/2 results in a single register).
7272 In this case for each copy we get the vector def for the reduction variable
7273 from the vectorized reduction operation generated in the previous iteration.
7275 This only works when we see both the reduction PHI and its only consumer
7276 in vectorizable_reduction and there are no intermediate stmts
7277 participating. When unrolling we want each unrolled iteration to have its
7278 own reduction accumulator since one of the main goals of unrolling a
7279 reduction is to reduce the aggregate loop-carried latency. */
7281 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
7282 && reduc_chain_length
== 1
7283 && loop_vinfo
->suggested_unroll_factor
== 1)
7284 single_defuse_cycle
= true;
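  /* Schematic comparison of the two schemes described above (not literal
     GIMPLE).  Without the single def-use cycle and ncopies == 2 there are
     two independent accumulators

         suma_1 = PHI <inita, suma_2>        sumb_1 = PHI <initb, sumb_2>
         suma_2 = suma_1 + dataa;            sumb_2 = sumb_1 + datab;

     combined only in the epilogue, hiding loop-carried latency.  With the
     single def-use cycle the second copy feeds on the first

         sum_2 = sum_1 + dataa;
         sum_3 = sum_2 + datab;

     needing a single PHI but serializing the operations.  */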
7286 if (single_defuse_cycle
|| lane_reduc_code_p
)
7288 gcc_assert (op
.code
!= COND_EXPR
);
7290 /* 4. Supportable by target? */
7293 /* 4.1. check support for the operation in the loop */
7294 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
7295 if (!directly_supported_p (op
.code
, vectype_in
, optab_query_kind
))
7297 if (dump_enabled_p ())
7298 dump_printf (MSG_NOTE
, "op not supported by target.\n");
7299 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
7300 || !vect_can_vectorize_without_simd_p (op
.code
))
7303 if (dump_enabled_p ())
7304 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
7307 if (vect_emulated_vector_p (vectype_in
)
7308 && !vect_can_vectorize_without_simd_p (op
.code
))
7310 if (dump_enabled_p ())
7311 dump_printf (MSG_NOTE
, "using word mode not possible.\n");
7315 /* lane-reducing operations have to go through vect_transform_reduction.
7316 For the other cases try without the single cycle optimization. */
7319 if (lane_reduc_code_p
)
7322 single_defuse_cycle
= false;
7325 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
7327 /* If the reduction stmt is one of the patterns that have lane
7328 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7329 if ((ncopies
> 1 && ! single_defuse_cycle
)
7330 && lane_reduc_code_p
)
7332 if (dump_enabled_p ())
7333 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7334 "multi def-use cycle not possible for lane-reducing "
7335 "reduction operation\n");
7340 && !(!single_defuse_cycle
7341 && !lane_reduc_code_p
7342 && reduction_type
!= FOLD_LEFT_REDUCTION
))
7343 for (i
= 0; i
< (int) op
.num_ops
; i
++)
7344 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_in
))
7346 if (dump_enabled_p ())
7347 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7348 "incompatible vector types for invariants\n");
7353 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7357 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
7358 reduction_type
, ncopies
, cost_vec
);
7359 /* Cost the reduction op inside the loop if transformed via
7360 vect_transform_reduction. Otherwise this is costed by the
7361 separate vectorizable_* routines. */
7362 if (single_defuse_cycle
|| lane_reduc_code_p
)
7363 record_stmt_cost (cost_vec
, ncopies
, vector_stmt
, stmt_info
, 0, vect_body
);
7365 if (dump_enabled_p ()
7366 && reduction_type
== FOLD_LEFT_REDUCTION
)
7367 dump_printf_loc (MSG_NOTE
, vect_location
,
7368 "using an in-order (fold-left) reduction.\n");
7369 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
7370 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7371 reductions go through their own vectorizable_* routines. */
7372 if (!single_defuse_cycle
7373 && !lane_reduc_code_p
7374 && reduction_type
!= FOLD_LEFT_REDUCTION
)
7377 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
7378 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
7380 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
7381 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
7383 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
7384 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
7386 else if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
7388 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7389 internal_fn cond_fn
= get_conditional_internal_fn (op
.code
, op
.type
);
7391 if (reduction_type
!= FOLD_LEFT_REDUCTION
7392 && !use_mask_by_cond_expr_p (op
.code
, cond_fn
, vectype_in
)
7393 && (cond_fn
== IFN_LAST
7394 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7395 OPTIMIZE_FOR_SPEED
)))
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7399 "can't operate on partial vectors because"
7400 " no conditional operation is available.\n");
7401 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7403 else if (reduction_type
== FOLD_LEFT_REDUCTION
7404 && reduc_fn
== IFN_LAST
7405 && !expand_vec_cond_expr_p (vectype_in
,
7406 truth_type_for (vectype_in
),
7409 if (dump_enabled_p ())
7410 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7411 "can't operate on partial vectors because"
7412 " no conditional operation is available.\n");
7413 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7416 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value.  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
                          stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                          gimple **vec_stmt, slp_tree slp_node)
{
7430 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7431 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7436 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7437 gcc_assert (reduc_info
->is_reduc_info
);
7439 if (nested_in_vect_loop_p (loop
, stmt_info
))
7442 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
7446 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
7448 gcc_assert (op
.code
.is_tree_code ());
7449 auto code
= tree_code (op
.code
);
7451 /* All uses but the last are expected to be defined in the loop.
7452 The last use is the reduction variable. In case of nested cycle this
7453 assumption is not true: we use reduc_index to record the index of the
7454 reduction variable. */
7455 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
7456 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
7457 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
7458 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7463 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7467 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7471 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7472 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7473 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
7476 tree new_temp
= NULL_TREE
;
7477 auto_vec
<tree
> vec_oprnds0
;
7478 auto_vec
<tree
> vec_oprnds1
;
7479 auto_vec
<tree
> vec_oprnds2
;
7482 if (dump_enabled_p ())
7483 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7485 /* FORNOW: Multiple types are not supported for condition. */
7486 if (code
== COND_EXPR
)
7487 gcc_assert (ncopies
== 1);
7489 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
7491 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7492 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7494 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
7495 return vectorize_fold_left_reduction
7496 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
7497 reduc_fn
, op
.ops
, vectype_in
, reduc_index
, masks
);
7500 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
7501 gcc_assert (single_defuse_cycle
7502 || code
== DOT_PROD_EXPR
7503 || code
== WIDEN_SUM_EXPR
7504 || code
== SAD_EXPR
);
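  /* Note on the lane-reducing codes allowed by the assert above:
     DOT_PROD_EXPR, WIDEN_SUM_EXPR and SAD_EXPR consume full input vectors
     but update an accumulator with fewer, wider lanes, e.g. a V16QI
     dot-product updating a V4SI accumulator, which is why they are always
     transformed here rather than by the generic vectorizable_* routines.  */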
7506 /* Create the destination vector */
7507 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
7508 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
7510 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
7511 single_defuse_cycle
&& reduc_index
== 0
7512 ? NULL_TREE
: op
.ops
[0], &vec_oprnds0
,
7513 single_defuse_cycle
&& reduc_index
== 1
7514 ? NULL_TREE
: op
.ops
[1], &vec_oprnds1
,
7516 && !(single_defuse_cycle
&& reduc_index
== 2)
7517 ? op
.ops
[2] : NULL_TREE
, &vec_oprnds2
);
7518 if (single_defuse_cycle
)
7520 gcc_assert (!slp_node
);
7521 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7522 op
.ops
[reduc_index
],
7523 reduc_index
== 0 ? &vec_oprnds0
7524 : (reduc_index
== 1 ? &vec_oprnds1
7528 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7531 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7532 if (masked_loop_p
&& !mask_by_cond_expr
)
7534 /* Make sure that the reduction accumulator is vop[0]. */
7535 if (reduc_index
== 1)
7537 gcc_assert (commutative_tree_code (code
));
7538 std::swap (vop
[0], vop
[1]);
7540 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7542 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7543 vop
[0], vop
[1], vop
[0]);
7544 new_temp
= make_ssa_name (vec_dest
, call
);
7545 gimple_call_set_lhs (call
, new_temp
);
7546 gimple_call_set_nothrow (call
, true);
7547 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, call
, gsi
);
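          /* The call built above has the form (illustration, assuming an
             add reduction whose cond_fn is IFN_COND_ADD):

                 new_temp = .COND_ADD (mask, vop[0], vop[1], vop[0]);

             inactive lanes pass the accumulator vop[0] through unchanged,
             so masked-off iterations do not disturb the reduction.  */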
7552 if (op
.num_ops
== 3)
7553 vop
[2] = vec_oprnds2
[i
];
7555 if (masked_loop_p
&& mask_by_cond_expr
)
7557 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7559 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7562 new_stmt
= gimple_build_assign (vec_dest
, code
,
7563 vop
[0], vop
[1], vop
[2]);
7564 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7565 gimple_assign_set_lhs (new_stmt
, new_temp
);
7566 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7570 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
7571 else if (single_defuse_cycle
7574 if (reduc_index
== 0)
7575 vec_oprnds0
.safe_push (gimple_get_lhs (new_stmt
));
7576 else if (reduc_index
== 1)
7577 vec_oprnds1
.safe_push (gimple_get_lhs (new_stmt
));
7578 else if (reduc_index
== 2)
7579 vec_oprnds2
.safe_push (gimple_get_lhs (new_stmt
));
7582 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
7586 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
/* Transform phase of a cycle PHI.  */

bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
                          stmt_vec_info stmt_info, gimple **vec_stmt,
                          slp_tree slp_node, slp_instance slp_node_instance)
{
7598 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7599 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7603 bool nested_cycle
= false;
7606 if (nested_in_vect_loop_p (loop
, stmt_info
))
7609 nested_cycle
= true;
7612 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
7613 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
7614 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7615 gcc_assert (reduc_info
->is_reduc_info
);
7617 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
7618 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
7619 /* Leave the scalar phi in place. */
7622 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7623 /* For a nested cycle we do not fill the above. */
7625 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
7626 gcc_assert (vectype_in
);
7630 /* The size vect_schedule_slp_instance computes is off for us. */
7631 vec_num
= vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7632 * SLP_TREE_LANES (slp_node
), vectype_in
);
7638 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7641 /* Check whether we should use a single PHI node and accumulate
7642 vectors to one before the backedge. */
7643 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
7646 /* Create the destination vector */
7647 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
7648 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
7651 /* Get the loop-entry arguments. */
7652 tree vec_initial_def
= NULL_TREE
;
7653 auto_vec
<tree
> vec_initial_defs
;
7656 vec_initial_defs
.reserve (vec_num
);
7659 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
7660 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
7665 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
7666 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
7667 vec
<stmt_vec_info
> &stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
7669 unsigned int num_phis
= stmts
.length ();
7670 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
7672 initial_values
.reserve (num_phis
);
7673 for (unsigned int i
= 0; i
< num_phis
; ++i
)
7675 gphi
*this_phi
= as_a
<gphi
*> (stmts
[i
]->stmt
);
7676 initial_values
.quick_push (vect_phi_initial_value (this_phi
));
7679 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
7680 if (!initial_values
.is_empty ())
7683 = (num_phis
== 1 ? initial_values
[0] : NULL_TREE
);
7684 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7686 = neutral_op_for_reduction (TREE_TYPE (vectype_out
),
7687 code
, initial_value
);
7688 get_initial_defs_for_reduction (loop_vinfo
, reduc_info
,
7689 &vec_initial_defs
, vec_num
,
7690 stmts
.length (), neutral_op
);
7696 /* Get at the scalar def before the loop, that defines the initial
7697 value of the reduction variable. */
7698 tree initial_def
= vect_phi_initial_value (phi
);
7699 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
7700 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7701 and we can't use zero for induc_val, use initial_def. Similarly
7702 for REDUC_MIN and initial_def larger than the base. */
7703 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
7705 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
7706 if (TREE_CODE (initial_def
) == INTEGER_CST
7707 && !integer_zerop (induc_val
)
7708 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
7709 && tree_int_cst_lt (initial_def
, induc_val
))
7710 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
7711 && tree_int_cst_lt (induc_val
, initial_def
))))
7713 induc_val
= initial_def
;
              /* Communicate we used the initial_def to epilogue
                 generation.  */
7716 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
7718 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
7720 else if (nested_cycle
)
7722 /* Do not use an adjustment def as that case is not supported
7723 correctly if ncopies is not one. */
7724 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
7725 ncopies
, initial_def
,
7728 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == CONST_COND_REDUCTION
7729 || STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
7730 /* Fill the initial vector with the initial scalar value. */
7732 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
,
7733 initial_def
, initial_def
);
7737 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
7738 if (!reduc_info
->reduc_initial_values
.is_empty ())
7740 initial_def
= reduc_info
->reduc_initial_values
[0];
7741 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7743 = neutral_op_for_reduction (TREE_TYPE (initial_def
),
7745 gcc_assert (neutral_op
);
7746 /* Try to simplify the vector initialization by applying an
7747 adjustment after the reduction has been performed. */
7748 if (!reduc_info
->reused_accumulator
7749 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
7750 && !operand_equal_p (neutral_op
, initial_def
))
7752 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
)
7754 initial_def
= neutral_op
;
7757 = get_initial_def_for_reduction (loop_vinfo
, reduc_info
,
7758 initial_def
, neutral_op
);
7763 if (vec_initial_def
)
7765 vec_initial_defs
.create (ncopies
);
7766 for (i
= 0; i
< ncopies
; ++i
)
7767 vec_initial_defs
.quick_push (vec_initial_def
);
7770 if (auto *accumulator
= reduc_info
->reused_accumulator
)
7772 tree def
= accumulator
->reduc_input
;
7773 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
7775 unsigned int nreduc
;
7776 bool res
= constant_multiple_p (TYPE_VECTOR_SUBPARTS
7778 TYPE_VECTOR_SUBPARTS (vectype_out
),
7781 gimple_seq stmts
= NULL
;
7782 /* Reduce the single vector to a smaller one. */
7785 /* Perform the reduction in the appropriate type. */
7786 tree rvectype
= vectype_out
;
7787 if (!useless_type_conversion_p (TREE_TYPE (vectype_out
),
7788 TREE_TYPE (TREE_TYPE (def
))))
7789 rvectype
= build_vector_type (TREE_TYPE (TREE_TYPE (def
)),
7790 TYPE_VECTOR_SUBPARTS
7792 def
= vect_create_partial_epilog (def
, rvectype
,
7793 STMT_VINFO_REDUC_CODE
7797 /* The epilogue loop might use a different vector mode, like
7799 if (TYPE_MODE (vectype_out
) != TYPE_MODE (TREE_TYPE (def
)))
7801 tree reduc_type
= build_vector_type_for_mode
7802 (TREE_TYPE (TREE_TYPE (def
)), TYPE_MODE (vectype_out
));
7803 def
= gimple_convert (&stmts
, reduc_type
, def
);
7805 /* Adjust the input so we pick up the partially reduced value
7806 for the skip edge in vect_create_epilog_for_reduction. */
7807 accumulator
->reduc_input
= def
;
7808 /* And the reduction could be carried out using a different sign. */
7809 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
7810 def
= gimple_convert (&stmts
, vectype_out
, def
);
7811 if (loop_vinfo
->main_loop_edge
)
7813 /* While we'd like to insert on the edge this will split
7814 blocks and disturb bookkeeping, we also will eventually
7815 need this on the skip edge. Rely on sinking to
7816 fixup optimal placement and insert in the pred. */
7817 gimple_stmt_iterator gsi
7818 = gsi_last_bb (loop_vinfo
->main_loop_edge
->src
);
7819 /* Insert before a cond that eventually skips the
7821 if (!gsi_end_p (gsi
) && stmt_ends_bb_p (gsi_stmt (gsi
)))
7823 gsi_insert_seq_after (&gsi
, stmts
, GSI_CONTINUE_LINKING
);
7826 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
),
7829 if (loop_vinfo
->main_loop_edge
)
7831 = vect_get_main_loop_result (loop_vinfo
, def
,
7832 vec_initial_defs
[0]);
7834 vec_initial_defs
.safe_push (def
);
7837 /* Generate the reduction PHIs upfront. */
7838 for (i
= 0; i
< vec_num
; i
++)
7840 tree vec_init_def
= vec_initial_defs
[i
];
7841 for (j
= 0; j
< ncopies
; j
++)
7843 /* Create the reduction-phi that defines the reduction
7845 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
7847 /* Set the loop-entry arg of the reduction-phi. */
7848 if (j
!= 0 && nested_cycle
)
7849 vec_init_def
= vec_initial_defs
[j
];
7850 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
7853 /* The loop-latch arg is set in epilogue processing. */
7856 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7860 *vec_stmt
= new_phi
;
7861 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
/* Vectorizes LC PHIs.  */

bool
vectorizable_lc_phi (loop_vec_info loop_vinfo,
                     stmt_vec_info stmt_info, gimple **vec_stmt,
                     slp_tree slp_node)
{
  if (!loop_vinfo
      || !is_a <gphi *> (stmt_info->stmt)
      || gimple_phi_num_args (stmt_info->stmt) != 1)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;
7885 if (!vec_stmt
) /* transformation not required. */
7887 /* Deal with copies from externs or constants that disguise as
7888 loop-closed PHI nodes (PR97886). */
7890 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node
)[0],
7891 SLP_TREE_VECTYPE (slp_node
)))
7893 if (dump_enabled_p ())
7894 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7895 "incompatible vector types for invariants\n");
7898 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
7902 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7903 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7904 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7905 edge e
= single_pred_edge (bb
);
7906 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7907 auto_vec
<tree
> vec_oprnds
;
7908 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
,
7909 !slp_node
? vect_get_num_copies (loop_vinfo
, vectype
) : 1,
7910 gimple_phi_arg_def (stmt_info
->stmt
, 0), &vec_oprnds
);
7911 for (unsigned i
= 0; i
< vec_oprnds
.length (); i
++)
7913 /* Create the vectorized LC PHI node. */
7914 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
7915 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
7917 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7919 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
7922 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
/* Vectorizes PHIs.  */

bool
vectorizable_phi (vec_info *,
                  stmt_vec_info stmt_info, gimple **vec_stmt,
                  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);
7942 if (!vec_stmt
) /* transformation not required. */
7946 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), i
, child
)
7949 if (dump_enabled_p ())
7950 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7951 "PHI node with unvectorized backedge def\n");
7954 else if (!vect_maybe_update_slp_op_vectype (child
, vectype
))
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7958 "incompatible vector types for invariants\n");
7961 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
7962 && !useless_type_conversion_p (vectype
,
7963 SLP_TREE_VECTYPE (child
)))
          /* With bools we can have mask and non-mask precision vectors
             or different non-mask precisions.  While pattern recog is
             supposed to guarantee consistency here, bugs in it can cause
             mismatches (PR103489 and PR103800 for example).
             Deal with them here instead of ICEing later.  */
7970 if (dump_enabled_p ())
7971 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7972 "incompatible vector type setup from "
7973 "bool pattern detection\n");
7977 /* For single-argument PHIs assume coalescing which means zero cost
7978 for the scalar and the vector PHIs. This avoids artificially
7979 favoring the vector path (but may pessimize it in some cases). */
7980 if (gimple_phi_num_args (as_a
<gphi
*> (stmt_info
->stmt
)) > 1)
7981 record_stmt_cost (cost_vec
, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
7982 vector_stmt
, stmt_info
, vectype
, 0, vect_body
);
7983 STMT_VINFO_TYPE (stmt_info
) = phi_info_type
;
7987 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7988 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7989 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7990 auto_vec
<gphi
*> new_phis
;
7991 for (unsigned i
= 0; i
< gimple_phi_num_args (stmt_info
->stmt
); ++i
)
7993 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
7995 /* Skip not yet vectorized defs. */
7996 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
7997 && SLP_TREE_VEC_STMTS (child
).is_empty ())
8000 auto_vec
<tree
> vec_oprnds
;
8001 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[i
], &vec_oprnds
);
8002 if (!new_phis
.exists ())
8004 new_phis
.create (vec_oprnds
.length ());
8005 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8007 /* Create the vectorized LC PHI node. */
8008 new_phis
.quick_push (create_phi_node (vec_dest
, bb
));
8009 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phis
[j
]);
8012 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (stmt_info
->stmt
), i
);
8013 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8014 add_phi_arg (new_phis
[j
], vec_oprnds
[j
], e
, UNKNOWN_LOCATION
);
8016 /* We should have at least one already vectorized child. */
8017 gcc_assert (new_phis
.exists ());
/* Return true if VECTYPE represents a vector that requires lowering
   by the vector lowering pass.  */

bool
vect_emulated_vector_p (tree vectype)
{
  return (!VECTOR_MODE_P (TYPE_MODE (vectype))
          && (!VECTOR_BOOLEAN_TYPE_P (vectype)
              || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
}
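/* Example of what the predicate above is meant to catch (illustrative):
   a V4HI vector type on a target without a matching vector mode ends up
   with a non-vector (integer or BLK) mode, so its operations have to be
   open-coded by the generic vector lowering pass.  Single-bit boolean
   vectors are excluded because such mask types are deliberately carried
   in integer modes.  */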
/* Return true if we can emulate CODE on an integer mode representation
   of a vector.  */

bool
vect_can_vectorize_without_simd_p (tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return true;

    default:
      return false;
    }
}

/* Likewise, but taking a code_helper.  */

bool
vect_can_vectorize_without_simd_p (code_helper code)
{
  return (code.is_tree_code ()
          && vect_can_vectorize_without_simd_p (tree_code (code)));
}
/* Function vectorizable_induction

   Check if STMT_INFO performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
   Return true if STMT_INFO is vectorizable in this way.  */

bool
vectorizable_induction (loop_vec_info loop_vinfo,
                        stmt_vec_info stmt_info,
                        gimple **vec_stmt, slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
{
8077 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8079 bool nested_in_vect_loop
= false;
8080 class loop
*iv_loop
;
8082 edge pe
= loop_preheader_edge (loop
);
8084 tree new_vec
, vec_init
, vec_step
, t
;
8087 gphi
*induction_phi
;
8088 tree induc_def
, vec_dest
;
8089 tree init_expr
, step_expr
;
8090 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8093 gimple_stmt_iterator si
;
8095 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
8099 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8102 /* Make sure it was recognized as induction computation. */
8103 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
8106 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8107 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8112 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8113 gcc_assert (ncopies
>= 1);
8115 /* FORNOW. These restrictions should be relaxed. */
8116 if (nested_in_vect_loop_p (loop
, stmt_info
))
8118 imm_use_iterator imm_iter
;
8119 use_operand_p use_p
;
8126 if (dump_enabled_p ())
8127 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8128 "multiple types in nested loop.\n");
8133 latch_e
= loop_latch_edge (loop
->inner
);
8134 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
8135 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
8137 gimple
*use_stmt
= USE_STMT (use_p
);
8138 if (is_gimple_debug (use_stmt
))
8141 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
8143 exit_phi
= use_stmt
;
8149 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
8150 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
8151 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
8153 if (dump_enabled_p ())
8154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8155 "inner-loop induction only used outside "
8156 "of the outer vectorized loop.\n");
8161 nested_in_vect_loop
= true;
8162 iv_loop
= loop
->inner
;
8166 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
8168 if (slp_node
&& !nunits
.is_constant ())
8170 /* The current SLP code creates the step value element-by-element. */
8171 if (dump_enabled_p ())
8172 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8173 "SLP induction not supported for variable-length"
8178 if (FLOAT_TYPE_P (vectype
) && !param_vect_induction_float
)
8180 if (dump_enabled_p ())
8181 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8182 "floating point induction vectorization disabled\n");
8186 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
8187 gcc_assert (step_expr
!= NULL_TREE
);
8188 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
8190 /* Check for backend support of PLUS/MINUS_EXPR. */
8191 if (!directly_supported_p (PLUS_EXPR
, step_vectype
)
8192 || !directly_supported_p (MINUS_EXPR
, step_vectype
))
8195 if (!vec_stmt
) /* transformation not required. */
8197 unsigned inside_cost
= 0, prologue_cost
= 0;
8200 /* We eventually need to set a vector type on invariant
8204 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
8205 if (!vect_maybe_update_slp_op_vectype
8206 (child
, SLP_TREE_VECTYPE (slp_node
)))
8208 if (dump_enabled_p ())
8209 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8210 "incompatible vector types for "
8214 /* loop cost for vec_loop. */
8216 = record_stmt_cost (cost_vec
,
8217 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
8218 vector_stmt
, stmt_info
, 0, vect_body
);
8219 /* prologue cost for vec_init (if not nested) and step. */
8220 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
8222 stmt_info
, 0, vect_prologue
);
8224 else /* if (!slp_node) */
8226 /* loop cost for vec_loop. */
8227 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
8228 stmt_info
, 0, vect_body
);
8229 /* prologue cost for vec_init and vec_step. */
8230 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
8231 stmt_info
, 0, vect_prologue
);
8233 if (dump_enabled_p ())
8234 dump_printf_loc (MSG_NOTE
, vect_location
,
8235 "vect_model_induction_cost: inside_cost = %d, "
8236 "prologue_cost = %d .\n", inside_cost
,
8239 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
8240 DUMP_VECT_SCOPE ("vectorizable_induction");
8246 /* Compute a vector variable, initialized with the first VF values of
8247 the induction variable. E.g., for an iv with IV_PHI='X' and
8248 evolution S, for a vector of 4 units, we want to compute:
8249 [X, X + S, X + 2*S, X + 3*S]. */
8251 if (dump_enabled_p ())
8252 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
8254 pe
= loop_preheader_edge (iv_loop
);
8255 /* Find the first insertion point in the BB. */
8256 basic_block bb
= gimple_bb (phi
);
8257 si
= gsi_after_labels (bb
);
8259 /* For SLP induction we have to generate several IVs as for example
8260 with group size 3 we need
8261 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8262 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8265 /* Enforced above. */
8266 unsigned int const_nunits
= nunits
.to_constant ();
8268 /* The initial values are vectorized, but any lanes > group_size
8271 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
8273 /* Gather steps. Since we do not vectorize inductions as
8274 cycles we have to reconstruct the step from SCEV data. */
8275 unsigned group_size
= SLP_TREE_LANES (slp_node
);
8276 tree
*steps
= XALLOCAVEC (tree
, group_size
);
8277 tree
*inits
= XALLOCAVEC (tree
, group_size
);
8278 stmt_vec_info phi_info
;
8279 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
8281 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
8283 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
8287 /* Now generate the IVs. */
8288 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8289 gcc_assert ((const_nunits
* nvects
) % group_size
== 0);
8291 if (nested_in_vect_loop
)
8295 /* Compute the number of distinct IVs we need. First reduce
8296 group_size if it is a multiple of const_nunits so we get
8297 one IV for a group_size of 4 but const_nunits 2. */
8298 unsigned group_sizep
= group_size
;
8299 if (group_sizep
% const_nunits
== 0)
8300 group_sizep
= group_sizep
/ const_nunits
;
8301 nivs
= least_common_multiple (group_sizep
,
8302 const_nunits
) / const_nunits
;
8304 tree stept
= TREE_TYPE (step_vectype
);
8305 tree lupdate_mul
= NULL_TREE
;
8306 if (!nested_in_vect_loop
)
8308 /* The number of iterations covered in one vector iteration. */
8309 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
8311 = build_vector_from_val (step_vectype
,
8312 SCALAR_FLOAT_TYPE_P (stept
)
8313 ? build_real_from_wide (stept
, lup_mul
,
8315 : build_int_cstu (stept
, lup_mul
));
8317 tree peel_mul
= NULL_TREE
;
8318 gimple_seq init_stmts
= NULL
;
8319 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
8321 if (SCALAR_FLOAT_TYPE_P (stept
))
8322 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
8323 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8325 peel_mul
= gimple_convert (&init_stmts
, stept
,
8326 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8327 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
8328 step_vectype
, peel_mul
);
8331 auto_vec
<tree
> vec_steps
;
8332 for (ivn
= 0; ivn
< nivs
; ++ivn
)
8334 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
8335 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
8336 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
8337 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
8339 /* The scalar steps of the IVs. */
8340 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
8341 elt
= gimple_convert (&init_stmts
, TREE_TYPE (step_vectype
), elt
);
8342 step_elts
.quick_push (elt
);
8345 /* The scalar inits of the IVs if not vectorized. */
8346 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
8347 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
8349 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
8350 TREE_TYPE (vectype
), elt
);
8351 init_elts
.quick_push (elt
);
8353 /* The number of steps to add to the initial values. */
8354 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
8355 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
8356 ? build_real_from_wide (stept
,
8358 : build_int_cstu (stept
, mul_elt
));
8360 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
8361 vec_steps
.safe_push (vec_step
);
8362 tree step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
8364 step_mul
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8365 step_mul
, peel_mul
);
8367 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
8369 /* Create the induction-phi that defines the induction-operand. */
8370 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
8372 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8373 induc_def
= PHI_RESULT (induction_phi
);
8375 /* Create the iv update inside the loop */
8378 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8379 vec_step
, lupdate_mul
);
8380 gimple_seq stmts
= NULL
;
8381 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8382 vec_def
= gimple_build (&stmts
,
8383 PLUS_EXPR
, step_vectype
, vec_def
, up
);
8384 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8385 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8386 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8390 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
8391 if (!nested_in_vect_loop
8392 && !integer_zerop (step_mul
))
8394 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
8395 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8396 vec_step
, step_mul
);
8397 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8399 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
8402 /* Set the arguments of the phi node: */
8403 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8405 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
8407 if (!nested_in_vect_loop
)
8409 /* Fill up to the number of vectors we need for the whole group. */
8410 nivs
= least_common_multiple (group_size
,
8411 const_nunits
) / const_nunits
;
8412 vec_steps
.reserve (nivs
-ivn
);
8413 for (; ivn
< nivs
; ++ivn
)
8415 SLP_TREE_VEC_STMTS (slp_node
)
8416 .quick_push (SLP_TREE_VEC_STMTS (slp_node
)[0]);
8417 vec_steps
.quick_push (vec_steps
[0]);
8421 /* Re-use IVs when we can. We are generating further vector
8422 stmts by adding VF' * stride to the IVs generated above. */
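          /* Illustration of the re-use (schematic): if the IVs built above
             cover the first nivs vectors, every further vector is derived as

                 vec[ivn] = vec[ivn - nivs] + VF' * step

             inserted right after the statement defining vec[ivn - nivs],
             so no additional PHI nodes are required.  */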
8426 = least_common_multiple (group_size
, const_nunits
) / group_size
;
8428 = build_vector_from_val (step_vectype
,
8429 SCALAR_FLOAT_TYPE_P (stept
)
8430 ? build_real_from_wide (stept
,
8432 : build_int_cstu (stept
, vfp
));
8433 for (; ivn
< nvects
; ++ivn
)
8435 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
8436 tree def
= gimple_get_lhs (iv
);
8438 vec_steps
[ivn
- nivs
]
8439 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8440 vec_steps
[ivn
- nivs
], lupdate_mul
);
8441 gimple_seq stmts
= NULL
;
8442 def
= gimple_convert (&stmts
, step_vectype
, def
);
8443 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8444 def
, vec_steps
[ivn
% nivs
]);
8445 def
= gimple_convert (&stmts
, vectype
, def
);
8446 if (gimple_code (iv
) == GIMPLE_PHI
)
8447 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8450 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
8451 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
8453 SLP_TREE_VEC_STMTS (slp_node
)
8454 .quick_push (SSA_NAME_DEF_STMT (def
));
8458 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
8459 gcc_assert (!new_bb
);
8464 init_expr
= vect_phi_initial_value (phi
);
8466 gimple_seq stmts
= NULL
;
8467 if (!nested_in_vect_loop
)
8469 /* Convert the initial value to the IV update type. */
8470 tree new_type
= TREE_TYPE (step_expr
);
8471 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
8473 /* If we are using the loop mask to "peel" for alignment then we need
8474 to adjust the start value here. */
8475 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
8476 if (skip_niters
!= NULL_TREE
)
8478 if (FLOAT_TYPE_P (vectype
))
8479 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
8482 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
8483 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
8484 skip_niters
, step_expr
);
8485 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
8486 init_expr
, skip_step
);
8492 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8493 gcc_assert (!new_bb
);
8496 /* Create the vector that holds the initial_value of the induction. */
8497 if (nested_in_vect_loop
)
8499 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8500 been created during vectorization of previous stmts. We obtain it
8501 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8502 auto_vec
<tree
> vec_inits
;
8503 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
8504 init_expr
, &vec_inits
);
8505 vec_init
= vec_inits
[0];
8506 /* If the initial value is not of proper type, convert it. */
8507 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
8510 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
8514 build1 (VIEW_CONVERT_EXPR
, vectype
,
8516 vec_init
= gimple_assign_lhs (new_stmt
);
8517 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
8519 gcc_assert (!new_bb
);
8524 /* iv_loop is the loop to be vectorized. Create:
8525 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8527 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
8529 unsigned HOST_WIDE_INT const_nunits
;
8530 if (nunits
.is_constant (&const_nunits
))
8532 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
8533 elts
.quick_push (new_name
);
8534 for (i
= 1; i
< const_nunits
; i
++)
8536 /* Create: new_name_i = new_name + step_expr */
8537 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
8538 new_name
, step_expr
);
8539 elts
.quick_push (new_name
);
8541 /* Create a vector from [new_name_0, new_name_1, ...,
8542 new_name_nunits-1] */
8543 vec_init
= gimple_build_vector (&stmts
, &elts
);
8545 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
8546 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8547 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
8548 new_name
, step_expr
);
8552 [base, base, base, ...]
8553 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8554 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
8555 gcc_assert (flag_associative_math
);
8556 tree index
= build_index_vector (step_vectype
, 0, 1);
8557 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8559 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8561 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
8562 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
8563 vec_init
, step_vec
);
8564 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8565 vec_init
, base_vec
);
8567 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
8571 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8572 gcc_assert (!new_bb
);
8577 /* Create the vector that holds the step of the induction. */
8578 if (nested_in_vect_loop
)
8579 /* iv_loop is nested in the loop to be vectorized. Generate:
8580 vec_step = [S, S, S, S] */
8581 new_name
= step_expr
;
8584 /* iv_loop is the loop to be vectorized. Generate:
8585 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8586 gimple_seq seq
= NULL
;
8587 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8589 expr
= build_int_cst (integer_type_node
, vf
);
8590 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8593 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
8594 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8598 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8599 gcc_assert (!new_bb
);
8603 t
= unshare_expr (new_name
);
8604 gcc_assert (CONSTANT_CLASS_P (new_name
)
8605 || TREE_CODE (new_name
) == SSA_NAME
);
8606 new_vec
= build_vector_from_val (step_vectype
, t
);
8607 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8608 new_vec
, step_vectype
, NULL
);
8611 /* Create the following def-use cycle:
8616 vec_iv = PHI <vec_init, vec_loop>
8620 vec_loop = vec_iv + vec_step; */
8622 /* Create the induction-phi that defines the induction-operand. */
8623 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
8624 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8625 induc_def
= PHI_RESULT (induction_phi
);
8627 /* Create the iv update inside the loop */
8629 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8630 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8631 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8632 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8633 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8635 /* Set the arguments of the phi node: */
8636 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8637 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8640 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
8641 *vec_stmt
= induction_phi
;
8643 /* In case that vectorization factor (VF) is bigger than the number
8644 of elements that we can fit in a vectype (nunits), we have to generate
8645 more than one vector stmt - i.e - we need to "unroll" the
8646 vector stmt by a factor VF/nunits. For more details see documentation
8647 in vectorizable_operation. */
8651 gimple_seq seq
= NULL
;
8652 /* FORNOW. This restriction should be relaxed. */
8653 gcc_assert (!nested_in_vect_loop
);
8655 /* Create the vector that holds the step of the induction. */
8656 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8658 expr
= build_int_cst (integer_type_node
, nunits
);
8659 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8662 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
8663 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8667 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8668 gcc_assert (!new_bb
);
8671 t
= unshare_expr (new_name
);
8672 gcc_assert (CONSTANT_CLASS_P (new_name
)
8673 || TREE_CODE (new_name
) == SSA_NAME
);
8674 new_vec
= build_vector_from_val (step_vectype
, t
);
8675 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8676 new_vec
, step_vectype
, NULL
);
8678 vec_def
= induc_def
;
8679 for (i
= 1; i
< ncopies
; i
++)
8681 /* vec_i = vec_prev + vec_step */
8682 gimple_seq stmts
= NULL
;
8683 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
8684 vec_def
= gimple_build (&stmts
,
8685 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8686 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8688 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8689 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8690 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
8694 if (dump_enabled_p ())
8695 dump_printf_loc (MSG_NOTE
, vect_location
,
8696 "transform induction: created def-use cycle: %G%G",
8697 induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
/* Function vectorizable_live_operation.

   STMT_INFO computes a value that is used outside the loop.  Check if
   it can be supported.  */

bool
vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
                             gimple_stmt_iterator *gsi,
                             slp_tree slp_node, slp_instance slp_node_instance,
                             int slp_index, bool vec_stmt_p,
                             stmt_vector_for_cost *cost_vec)
{
8715 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
8716 imm_use_iterator imm_iter
;
8717 tree lhs
, lhs_type
, bitsize
;
8718 tree vectype
= (slp_node
8719 ? SLP_TREE_VECTYPE (slp_node
)
8720 : STMT_VINFO_VECTYPE (stmt_info
));
8721 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8724 auto_vec
<tree
> vec_oprnds
;
8726 poly_uint64 vec_index
= 0;
8728 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
8730 /* If a stmt of a reduction is live, vectorize it via
8731 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8732 validity so just trigger the transform here. */
8733 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
8739 /* For reduction chains the meta-info is attached to
8740 the group leader. */
8741 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
8742 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
8743 /* For SLP reductions we vectorize the epilogue for
8744 all involved stmts together. */
8745 else if (slp_index
!= 0)
8748 /* For SLP reductions the meta-info is attached to
8749 the representative. */
8750 stmt_info
= SLP_TREE_REPRESENTATIVE (slp_node
);
8752 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
8753 gcc_assert (reduc_info
->is_reduc_info
);
8754 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
8755 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
8757 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
8762 /* If STMT is not relevant and it is a simple assignment and its inputs are
8763 invariant then it can remain in place, unvectorized. The original last
8764 scalar value that it computes will be used. */
8765 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8767 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
8768 if (dump_enabled_p ())
8769 dump_printf_loc (MSG_NOTE
, vect_location
,
8770 "statement is simple and uses invariant. Leaving in "
8778 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8782 gcc_assert (slp_index
>= 0);
8784 /* Get the last occurrence of the scalar index from the concatenation of
8785 all the slp vectors. Calculate which slp vector it is and the index
8787 int num_scalar
= SLP_TREE_LANES (slp_node
);
8788 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8789 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
8791 /* Calculate which vector contains the result, and which lane of
8792 that vector we need. */
8793 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
8795 if (dump_enabled_p ())
8796 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8797 "Cannot determine which vector holds the"
8798 " final result.\n");
8805 /* No transformation required. */
8806 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
8808 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
8809 OPTIMIZE_FOR_SPEED
))
8811 if (dump_enabled_p ())
8812 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8813 "can't operate on partial vectors "
8814 "because the target doesn't support extract "
8815 "last reduction.\n");
8816 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8820 if (dump_enabled_p ())
8821 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8822 "can't operate on partial vectors "
8823 "because an SLP statement is live after "
8825 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8827 else if (ncopies
> 1)
8829 if (dump_enabled_p ())
8830 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8831 "can't operate on partial vectors "
8832 "because ncopies is greater than 1.\n");
8833 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8837 gcc_assert (ncopies
== 1 && !slp_node
);
8838 vect_record_loop_mask (loop_vinfo
,
8839 &LOOP_VINFO_MASKS (loop_vinfo
),
8843 /* ??? Enable for loop costing as well. */
8845 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
8850 /* Use the lhs of the original scalar statement. */
8851 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
8852 if (dump_enabled_p ())
8853 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
8856 lhs
= gimple_get_lhs (stmt
);
8857 lhs_type
= TREE_TYPE (lhs
);
8859 bitsize
= vector_element_bits_tree (vectype
);
8861 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8862 tree vec_lhs
, bitstart
;
8866 gcc_assert (!loop_vinfo
|| !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8868 /* Get the correct slp vectorized stmt. */
8869 vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
];
8870 vec_lhs
= gimple_get_lhs (vec_stmt
);
8872 /* Get entry to use. */
8873 bitstart
= bitsize_int (vec_index
);
8874 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
8878 /* For multiple copies, get the last copy. */
8879 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
8880 vec_lhs
= gimple_get_lhs (vec_stmt
);
8882 /* Get the last lane in the vector. */
8883 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
8888 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8889 requirement, insert one phi node for it. It looks like:
8896 # vec_lhs' = PHI <vec_lhs>
8897 new_tree = lane_extract <vec_lhs', ...>;
8900 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8901 basic_block exit_bb
= single_exit (loop
)->dest
;
8902 gcc_assert (single_pred_p (exit_bb
));
8904 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
8905 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
8906 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, vec_lhs
);
8908 gimple_seq stmts
= NULL
;
8910 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8914 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8916 where VEC_LHS is the vectorized live-out result and MASK is
8917 the loop mask for the final iteration. */
8918 gcc_assert (ncopies
== 1 && !slp_node
);
8919 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
8920 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
8922 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
8925 /* Convert the extracted vector element to the scalar type. */
8926 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
8930 tree bftype
= TREE_TYPE (vectype
);
8931 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8932 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8933 new_tree
= build3 (BIT_FIELD_REF
, bftype
,
8934 vec_lhs_phi
, bitsize
, bitstart
);
8935 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8936 &stmts
, true, NULL_TREE
);
8941 gimple_stmt_iterator exit_gsi
= gsi_after_labels (exit_bb
);
8942 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
8944 /* Remove existing phi from lhs and create one copy from new_tree. */
8945 tree lhs_phi
= NULL_TREE
;
8946 gimple_stmt_iterator gsi
;
8947 for (gsi
= gsi_start_phis (exit_bb
);
8948 !gsi_end_p (gsi
); gsi_next (&gsi
))
8950 gimple
*phi
= gsi_stmt (gsi
);
8951 if ((gimple_phi_arg_def (phi
, 0) == lhs
))
8953 remove_phi_node (&gsi
, false);
8954 lhs_phi
= gimple_phi_result (phi
);
8955 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
8956 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
8962 /* Replace use of lhs with newly computed result. If the use stmt is a
8963 single arg PHI, just replace all uses of PHI result. It's necessary
8964 because lcssa PHI defining lhs may be before newly inserted stmt. */
8965 use_operand_p use_p
;
8966 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8967 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
8968 && !is_gimple_debug (use_stmt
))
8970 if (gimple_code (use_stmt
) == GIMPLE_PHI
8971 && gimple_phi_num_args (use_stmt
) == 1)
8973 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
8977 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8978 SET_USE (use_p
, new_tree
);
8980 update_stmt (use_stmt
);
  else
    {
      /* For basic-block vectorization simply insert the lane-extraction.  */
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      tree new_tree = build3 (BIT_FIELD_REF, bftype,
			      vec_lhs, bitsize, bitstart);
      gimple_seq stmts = NULL;
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
				       &stmts, true, NULL_TREE);
      if (TREE_CODE (new_tree) == SSA_NAME
	  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
	SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
      if (is_a <gphi *> (vec_stmt))
	{
	  gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	}
      else
	{
	  gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
	  gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
	}
      /* Replace use of lhs with newly computed result.  If the use stmt is a
	 single arg PHI, just replace all uses of PHI result.  It's necessary
	 because lcssa PHI defining lhs may be before newly inserted stmt.  */
      use_operand_p use_p;
      stmt_vec_info use_stmt_info;
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
	if (!is_gimple_debug (use_stmt)
	    && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
		|| !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
	  {
	    /* ??? This can happen when the live lane ends up being
	       used in a vector construction code-generated by an
	       external SLP node (and code-generation for that already
	       happened).  See gcc.dg/vect/bb-slp-47.c.
	       Doing this is what would happen if that vector CTOR
	       were not code-generated yet so it is not too bad.
	       ??? In fact we'd likely want to avoid this situation
	       in the first place.  */
	    if (TREE_CODE (new_tree) == SSA_NAME
		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
		&& gimple_code (use_stmt) != GIMPLE_PHI
		&& !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
						use_stmt))
	      {
		enum tree_code code = gimple_assign_rhs_code (use_stmt);
		gcc_assert (code == CONSTRUCTOR
			    || code == VIEW_CONVERT_EXPR
			    || CONVERT_EXPR_CODE_P (code));
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Using original scalar computation for "
				   "live lane because use precedes vector "
				   "def\n");
		continue;
	      }
	    /* ??? It can also happen that we end up pulling a def into
	       a loop where replacing out-of-loop uses would require
	       a new LC SSA PHI node.  Retain the original scalar in
	       those cases as well.  PR98064.  */
	    if (TREE_CODE (new_tree) == SSA_NAME
		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
		&& (gimple_bb (use_stmt)->loop_father
		    != gimple_bb (vec_stmt)->loop_father)
		&& !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
					gimple_bb (use_stmt)->loop_father))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Using original scalar computation for "
				   "live lane because there is an out-of-loop "
				   "definition for it\n");
		continue;
	      }
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_tree);
	    update_stmt (use_stmt);
	  }
    }

  return true;
}
/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */

static void
vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple *ustmt;

  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
	{
	  basic_block bb;

	  if (!is_gimple_debug (ustmt))
	    continue;

	  bb = gimple_bb (ustmt);

	  if (!flow_bb_inside_loop_p (loop, bb))
	    {
	      if (gimple_debug_bind_p (ustmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "killing debug use\n");

		  gimple_debug_bind_reset_value (ustmt);
		  update_stmt (ustmt);
		}
	      else
		gcc_unreachable ();
	    }
	}
    }
}
/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
	return true;
      return false;
    }

  widest_int max;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      if (max < type_max)
	return true;
    }
  return false;
}
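/* As a concrete illustration: if the loop counter has an 8-bit unsigned type
   and NITERSM1 is 255, then NITERS = NITERSM1 + 1 wraps around to 0 and the
   function above returns false; with NITERSM1 = 254 the addition fits and it
   returns true.  (The 8-bit width is only an example.)  */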
/* Return a mask type with half the number of elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
  return build_truth_vector_type_for_mode (nunits, new_mode);
}

/* Return a mask type with twice as many elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_double_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
  return build_truth_vector_type_for_mode (nunits, new_mode);
}
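/* Illustrative example: starting from a 16-element mask type, the first
   function above yields an 8-element mask type and the second a 32-element
   mask type, both in the requested NEW_MODE; these helpers are typically
   used when packing or unpacking rgroup masks between vector widths.
   (The element counts here are only an example.)  */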
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   these vector masks with the vector version of SCALAR_MASK.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
		       unsigned int nvectors, tree vectype, tree scalar_mask)
{
  gcc_assert (nvectors != 0);
  if (masks->length () < nvectors)
    masks->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgm = &(*masks)[nvectors - 1];
  /* The number of scalars per iteration and the number of vectors are
     both compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (scalar_mask)
    {
      scalar_cond_masked_key cond (scalar_mask, nvectors);
      loop_vinfo->scalar_cond_masked_set.add (cond);
    }

  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
    {
      rgm->max_nscalars_per_iter = nscalars_per_iter;
      rgm->type = truth_type_for (vectype);
    }
}
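/* Worked example of the computation above: an rgroup that needs
   NVECTORS == 2 masks for vectors with TYPE_VECTOR_SUBPARTS == 8 in a loop
   with vectorization factor 8 controls 2 * 8 / 8 == 2 scalars per iteration
   (e.g. a two-element interleaved access), so max_nscalars_per_iter would be
   raised to 2 if it was smaller.  (The concrete numbers are illustrative.)  */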
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  rgroup_controls *rgm = &(*masks)[nvectors - 1];
  tree mask_type = rgm->type;

  /* Populate the rgroup's mask array, if this is the first time we've
     used it.  */
  if (rgm->controls.is_empty ())
    {
      rgm->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	  rgm->controls[i] = mask;
	}
    }

  tree mask = rgm->controls[index];
  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		TYPE_VECTOR_SUBPARTS (vectype)))
    {
      /* A loop mask for data type X can be reused for data type Y
	 if X has N times more elements than Y and if Y's elements
	 are N times bigger than X's.  In this case each sequence
	 of N elements in the loop mask will be all-zero or all-one.
	 We can then view-convert the mask so that each sequence of
	 N elements is replaced by a single element.  */
      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
			      TYPE_VECTOR_SUBPARTS (vectype)));
      gimple_seq seq = NULL;
      mask_type = truth_type_for (vectype);
      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
    }
  return mask;
}
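/* For instance, a loop mask created for eight 16-bit elements can be reused
   for four 32-bit elements: each consecutive pair of mask elements is known
   to be all-zero or all-one, so the VIEW_CONVERT_EXPR above collapses the
   8-element mask into the 4-element mask the caller asked for.  (The 8/4
   split merely illustrates the N-to-1 rule described in the comment.)  */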
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, the number of scalar occupied
     bytes and the number of vectors are all compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall back
	 to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
		  || (rgl->factor == 1 && factor == 1)
		  || (rgl->max_nscalars_per_iter * rgl->factor
		      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
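/* As an illustration of FACTOR: if a load of N 16-bit elements has to be
   emitted as a length-controlled load of 2*N bytes (a VnQI-style fallback),
   it would be recorded with FACTOR == 2 and the recorded length is then
   measured in byte subelements rather than in 16-bit elements.  (The element
   width is just an example of the splitting described above.)  */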
/* Given a complete set of lengths LENS, extract length number INDEX for an
   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */

tree
vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		   unsigned int nvectors, unsigned int index)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      gcc_assert (i == 0);
	      tree adjusted_len
		= make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;

  return rgl->controls[index];
}
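/* For example, on a target that registers a nonzero partial load/store bias,
   the control actually used by the vectorized accesses is the
   "adjusted_loop_len" SSA name created above (conceptually loop_len + bias)
   rather than the raw "loop_len"; with a zero bias the plain per-rgroup
   control is returned instead.  (The bias value itself is entirely
   target-specific.)  */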
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor VF.  */

static void
scale_profile_for_vect_loop (class loop *loop, unsigned vf)
{
  edge preheader = loop_preheader_edge (loop);
  /* Reduce loop iterations by the vectorization factor.  */
  gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
  profile_count freq_h = loop->header->count, freq_e = preheader->count ();

  if (freq_h.nonzero_p ())
    {
      profile_probability p;

      /* Avoid dropping loop body profile counter to 0 because of zero count
	 in loop's preheader.  */
      if (!(freq_e == profile_count::zero ()))
	freq_e = freq_e.force_nonzero ();
      p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
      scale_loop_frequencies (loop, p);
    }

  edge exit_e = single_exit (loop);
  exit_e->probability
    = profile_probability::always ().apply_scale (1, new_est_niter + 1);

  edge exit_l = single_pred_edge (loop->latch);
  profile_probability prob = exit_l->probability;
  exit_l->probability = exit_e->probability.invert ();
  if (prob.initialized_p () && exit_l->probability.initialized_p ())
    scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
}
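/* Rough example of the scaling above: with VF == 4 and an estimated 99 latch
   iterations, new_est_niter is about 24, so the loop body counts are scaled
   to roughly 25 executions per loop entry and the exit edge probability
   becomes about 1/25.  (The numbers are illustrative; niter_for_unrolled_loop
   applies its own capping.)  */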
/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.  */

static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
				     stmt_vec_info def_stmt_info)
{
  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
	  && (phi_info = loop_vinfo->lookup_stmt (phi))
	  && STMT_VINFO_RELEVANT_P (phi_info)
	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
	{
	  loop_p loop = gimple_bb (phi)->loop_father;
	  edge e = loop_latch_edge (loop);
	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
	    {
	      vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	      vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	      gcc_assert (phi_defs.length () == latch_defs.length ());
	      for (unsigned i = 0; i < phi_defs.length (); ++i)
		add_phi_arg (as_a <gphi *> (phi_defs[i]),
			     gimple_get_lhs (latch_defs[i]), e,
			     gimple_phi_arg_location (phi, e->dest_idx));
	    }
	}
}
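/* Sketch of the effect: given a vectorized cycle PHI whose latch argument is
   still missing, e.g.
     vect_sum_1.7 = PHI <vect_init_1.6(preheader), (?)(latch)>
   once the latch definition has been vectorized this function fills in
     vect_sum_1.7 = PHI <vect_init_1.6(preheader), vect_sum_2.8(latch)>
   one vector PHI per copy.  (The SSA names are illustrative only.)  */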
/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
   When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
   stmt_vec_info.  */

static bool
vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing statement: %G", stmt_info->stmt);

  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
    vect_loop_kill_debug_uses (loop, stmt_info);

  if (!STMT_VINFO_RELEVANT_P (stmt_info)
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  if (STMT_VINFO_VECTYPE (stmt_info))
    {
      poly_uint64 nunits
	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
      if (!STMT_SLP_TYPE (stmt_info)
	  && maybe_ne (nunits, vf)
	  && dump_enabled_p ())
	/* For SLP VF is set according to unrolling factor, and not
	   to vector size, hence for SLP this print is not valid.  */
	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
    }

  /* Pure SLP statements have already been vectorized.  We still need
     to apply loop vectorization to hybrid SLP statements.  */
  if (PURE_SLP_STMT (stmt_info))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");

  if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
    *seen_store = stmt_info;

  return true;
}
/* Helper function to pass to simplify_replace_tree to enable replacing trees
   that are recorded in the hash_map with their corresponding values.  */

static tree
find_in_mapping (tree t, void *context)
{
  hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;

  tree *value = mapping->get (t);
  return value ? *value : t;
}
/* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
   original loop that has now been vectorized.

   The inits of the data_references need to be advanced with the number of
   iterations of the main loop.  This has been computed in vect_do_peeling and
   is stored in parameter ADVANCE.  We first restore the data_references
   initial offset with the values recorded in ORIG_DRS_INIT.

   Since the loop_vec_info of this EPILOGUE was constructed for the original
   loop, its stmt_vec_infos all point to the original statements.  These need
   to be updated to point to their corresponding copies as well as the SSA_NAMES
   in their PATTERN_DEF_SEQs and RELATED_STMTs.

   The data_reference's connections also need to be updated.  Their
   corresponding dr_vec_info need to be reconnected to the EPILOGUE's
   stmt_vec_infos, their statements need to point to their corresponding copy,
   if they are gather loads or scatter stores then their reference needs to be
   updated to point to its corresponding copy and finally we set
   'base_misaligned' to false as we have already peeled for alignment in the
   prologue of the main loop.  */
static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
  auto_vec<gimple *> stmt_worklist;
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);

  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  We also create a mapping of all LHS' in
     the original loop and all the LHS' in the EPILOGUE and create worklists to
     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  mapping.put (gimple_phi_result (orig_stmt),
		       gimple_phi_result (new_stmt));
	  /* PHI nodes can not have patterns or related statements.  */
	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
	}

      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
	{
	  new_stmt = gsi_stmt (epilogue_gsi);
	  if (is_gimple_debug (new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));

	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
	    {
	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
	      for (gimple_stmt_iterator gsi = gsi_start (seq);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		stmt_worklist.safe_push (gsi_stmt (gsi));
	    }

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      stmt_worklist.safe_push (stmt);
	      /* Set BB such that the assert in
		 'get_initial_def_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt, gimple_bb (new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }
  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
     using the original main loop and thus need to be updated to refer to the
     cloned variables used in the epilogue.  */
  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
    {
      gimple *stmt = stmt_worklist[i];
      tree *new_op;

      for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
	{
	  tree op = gimple_op (stmt, j);
	  if ((new_op = mapping.get (op)))
	    gimple_set_op (stmt, j, *new_op);
	  else
	    {
	      /* PR92429: The last argument of simplify_replace_tree disables
		 folding when replacing arguments.  This is required as
		 otherwise you might end up with different statements than the
		 ones analyzed in vect_loop_analyze, leading to different
		 vectorization.  */
	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
					  &find_in_mapping, &mapping, false);
	      gimple_set_op (stmt, j, op);
	    }
	}
    }
  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure the
	 reference in the data references points to the corresponding copy of
	 the original in the epilogue.  */
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
	  == VMAT_GATHER_SCATTER)
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
      /* The vector size of the epilogue is smaller than that of the main loop
	 so the alignment is either the same or lower.  This means the dr will
	 by definition be aligned.  */
      STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
    }

  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.
   Returns scalar epilogue loop if any.  */

class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();
  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Profitability threshold is %d loop iterations.\n",
			 th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.  */
  edge e = single_exit (loop);
  if (! single_pred_p (e->dest))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "split exit edge\n");
    }
  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
      if (! single_pred_p (e->dest))
	{
	  split_loop_exit_edge (e, true);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
	}
    }
  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  tree advance;
  drs_init_vec orig_drs_init;

  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
			      &step_vector, &niters_vector_mult_vf, th,
			      check_profitability, niters_no_overflow,
			      &advance);

  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
	  && known_eq (lowest_vf, vf))
	{
	  niters_vector
	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
	  step_vector = build_one_cst (TREE_TYPE (niters));
	}
      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
				     &step_vector, niters_no_overflow);
      else
	/* vect_do_peeling subtracted the number of peeled prologue
	   iterations from LOOP_VINFO_NITERS.  */
	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
				     &niters_vector, &step_vector,
				     niters_no_overflow);
    }
  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);
  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will need
     to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "------>vectorizing phi: %G", phi);
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
	    vect_loop_kill_debug_uses (loop, stmt_info);

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if (STMT_VINFO_VECTYPE (stmt_info)
	      && (maybe_ne
		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
	    }
	}
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
	}
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si);)
	{
	  stmt = gsi_stmt (si);
	  /* During vectorization remove existing clobber stmts.  */
	  if (gimple_clobber_p (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&si, true);
	      release_defs (stmt);
	    }
	  else
	    {
	      /* Ignore vector stmts created in the outer loop.  */
	      stmt_info = loop_vinfo->lookup_stmt (stmt);

	      /* vector stmts created in the outer-loop during vectorization of
		 stmts in an inner-loop may not have a stmt_info, and do not
		 need to be vectorized.  */
	      stmt_vec_info seen_store = NULL;
	      if (stmt_info)
		{
		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
		    {
		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
			   !gsi_end_p (subsi); gsi_next (&subsi))
			{
			  stmt_vec_info pat_stmt_info
			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store);
			}
		      stmt_vec_info pat_stmt_info
			= STMT_VINFO_RELATED_STMT (stmt_info);
		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     pat_stmt_info);
		    }
		  else
		    {
		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
						    &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     stmt_info);
		    }
		}
	      gsi_next (&si);
	      if (seen_store)
		{
		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
		    /* Interleaving.  If IS_STORE is TRUE, the
		       vectorization of the interleaving chain was
		       completed - free all the stores in the chain.  */
		    vect_remove_stores (loop_vinfo,
					DR_GROUP_FIRST_ELEMENT (seen_store));
		  else
		    /* Free the attached stmt_vec_info and remove the stmt.  */
		    loop_vinfo->remove_stmt (stmt_info);
		}
	    }
	}
      /* Stub out scalar statements that must not survive vectorization.
	 Doing this here helps with grouped statements, or statements that
	 are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
	  if (!call || !gimple_call_internal_p (call))
	    continue;
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (ifn == IFN_MASK_LOAD)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree zero = build_zero_cst (TREE_TYPE (lhs));
		  gimple *new_stmt = gimple_build_assign (lhs, zero);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree else_arg
		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	}
    }
  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
			   niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);
  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
     by the vector code.  */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
	 iteration will have exactly alignment_npeels active elements.
	 In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
	= (final_iter_may_be_partial
	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
			    lowest_vf) - 1
	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
			     lowest_vf) - 1);
      if (main_vinfo)
	{
	  unsigned int bound;
	  poly_uint64 main_iters
	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
	  main_iters
	    = upper_bound (main_iters,
			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
	  if (can_div_away_from_zero_p (main_iters,
					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
					&bound))
	    loop->nb_iterations_upper_bound
	      = wi::umin ((widest_int) (bound - 1),
			  loop->nb_iterations_upper_bound);
	}
    }
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
			  + bias_for_lowest, lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
			   + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
			  assumed_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
			   assumed_vf) - 1);
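  /* Worked example of the bias arithmetic above: with lowest_vf == 4, no
     epilogue peeling (bias_for_lowest == 1) and an original latch bound of
     102 (i.e. 103 iterations), the new latch bound is
     floor ((102 + 1) / 4) - 1 == 24, i.e. at most 25 vector iterations.
     (The concrete numbers are purely illustrative.)  */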
  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
    }
  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is invalid after vectorization:
     the vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}
/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero, the store is not
   performed and, where possible, neither are the producers of the stored
   values.
   For example,
     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] +2;
	}
   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  if (worklist.is_empty ())
    return;
  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>  */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		  break;
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}
/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */

bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
    return true;
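  /* For instance, with compare_precision == 16, nitems == 4 and an iv_limit
     of 20000, the product 80000 needs 17 bits, so the IV could wrap before
     every lane becomes inactive and the check above returns true.  (The
     figures are illustrative only.)  */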