gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57 #include "case-cfn-macros.h"
  58
  59 /* Loop Vectorization Pass.
  60
  61    This pass tries to vectorize loops.
  62
  63    For example, the vectorizer transforms the following simple loop:
  64
  65         short a[N]; short b[N]; short c[N]; int i;
  66
  67         for (i=0; i<N; i++){
  68           a[i] = b[i] + c[i];
  69         }
  70
  71    as if it was manually vectorized by rewriting the source code into:
  72
  73         typedef int __attribute__((mode(V8HI))) v8hi;
  74         short a[N];  short b[N]; short c[N];   int i;
  75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  76         v8hi va, vb, vc;
  77
  78         for (i=0; i<N/8; i++){
  79           vb = pb[i];
  80           vc = pc[i];
  81           va = vb + vc;
  82           pa[i] = va;
  83         }
  84
  85         The main entry to this pass is vectorize_loops(), in which
  86    the vectorizer applies a set of analyses on a given set of loops,
  87    followed by the actual vectorization transformation for the loops that
  88    had successfully passed the analysis phase.
  89         Throughout this pass we make a distinction between two types of
  90    data: scalars (which are represented by SSA_NAMES), and memory references
  91    ("data-refs").  These two types of data require different handling both
  92    during analysis and transformation. The types of data-refs that the
  93    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  95    accesses are required to have a simple (consecutive) access pattern.
  96
  97    Analysis phase:
  98    ===============
  99         The driver for the analysis phase is vect_analyze_loop().
 100    It applies a set of analyses, some of which rely on the scalar evolution
 101    analyzer (scev) developed by Sebastian Pop.
 102
 103         During the analysis phase the vectorizer records some information
 104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 105    loop, as well as general information about the loop as a whole, which is
 106    recorded in a "loop_vec_info" struct attached to each loop.
 107
 108    Transformation phase:
 109    =====================
 110         The loop transformation phase scans all the stmts in the loop, and
 111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 112    the loop that needs to be vectorized.  It inserts the vector code sequence
 113    just before the scalar stmt S, and records a pointer to the vector code
 114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 115    attached to S).  This pointer will be used for the vectorization of following
 116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 117    otherwise, we rely on dead code elimination for removing it.
 118
 119         For example, say stmt S1 was vectorized into stmt VS1:
 120
 121    VS1: vb = px[i];
 122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 123    S2:  a = b;
 124
 125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 128    resulting sequence would be:
 129
 130    VS1: vb = px[i];
 131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 132    VS2: va = vb;
 133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134
 135         Operands that are not SSA_NAMEs, are data-refs that appear in
 136    load/store operations (like 'x[i]' in S1), and are handled differently.
 137
 138    Target modeling:
 139    =================
 140         Currently the only target specific information that is used is the
 141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 142    Targets that can support different sizes of vectors, for now will need
 143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 144    flexibility will be added in the future.
 145
 146         Since we only vectorize operations which vector form can be
 147    expressed using existing tree codes, to verify that an operation is
 148    supported, the vectorizer checks the relevant optab at the relevant
 149    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 150    the value found is CODE_FOR_nothing, then there's no target support, and
 151    we can't vectorize the stmt.
 152
 153    For additional information on this project see:
 154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 155 */
 156
 157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 158 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 159                                                bool *, bool *);
 160
 161 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 162    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 163    may already be set for general statements (not just data refs).  */
 164
 165 static opt_result
 166 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 167                               bool vectype_maybe_set_p,
 168                               poly_uint64 *vf)
 169 {
 170   gimple *stmt = stmt_info->stmt;
 171
 172   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 173        && !STMT_VINFO_LIVE_P (stmt_info))
 174       || gimple_clobber_p (stmt))
 175     {
 176       if (dump_enabled_p ())
 177         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 178       return opt_result::success ();
 179     }
 180
 181   tree stmt_vectype, nunits_vectype;
 182   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 183                                                    &stmt_vectype,
 184                                                    &nunits_vectype);
 185   if (!res)
 186     return res;
 187
 188   if (stmt_vectype)
 189     {
 190       if (STMT_VINFO_VECTYPE (stmt_info))
 191         /* The only case when a vectype had been already set is for stmts
 192            that contain a data ref, or for "pattern-stmts" (stmts generated
 193            by the vectorizer to represent/replace a certain idiom).  */
 194         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 195                      || vectype_maybe_set_p)
 196                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 197       else
 198         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 199     }
 200
 201   if (nunits_vectype)
 202     vect_update_max_nunits (vf, nunits_vectype);
 203
 204   return opt_result::success ();
 205 }
 206
 207 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 208    types of STMT_INFO and all attached pattern statements and update
 209    the vectorization factor VF accordingly.  Return true on success
 210    or false if something prevented vectorization.  */
 211
 212 static opt_result
 213 vect_determine_vf_for_stmt (vec_info *vinfo,
 214                             stmt_vec_info stmt_info, poly_uint64 *vf)
 215 {
 216   if (dump_enabled_p ())
 217     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 218                      stmt_info->stmt);
 219   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 220   if (!res)
 221     return res;
 222
 223   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 224       && STMT_VINFO_RELATED_STMT (stmt_info))
 225     {
 226       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 227       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 228
 229       /* If a pattern statement has def stmts, analyze them too.  */
 230       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 231            !gsi_end_p (si); gsi_next (&si))
 232         {
 233           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 234           if (dump_enabled_p ())
 235             dump_printf_loc (MSG_NOTE, vect_location,
 236                              "==> examining pattern def stmt: %G",
 237                              def_stmt_info->stmt);
 238           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 239           if (!res)
 240             return res;
 241         }
 242
 243       if (dump_enabled_p ())
 244         dump_printf_loc (MSG_NOTE, vect_location,
 245                          "==> examining pattern statement: %G",
 246                          stmt_info->stmt);
 247       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 248       if (!res)
 249         return res;
 250     }
 251
 252   return opt_result::success ();
 253 }
 254
 255 /* Function vect_determine_vectorization_factor
 256
 257    Determine the vectorization factor (VF).  VF is the number of data elements
 258    that are operated upon in parallel in a single iteration of the vectorized
 259    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 260    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 261    elements can fit in a single vector register.
 262
 263    We currently support vectorization of loops in which all types operated upon
 264    are of the same size.  Therefore this function currently sets VF according to
 265    the size of the types operated upon, and fails if there are multiple sizes
 266    in the loop.
 267
 268    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 269    original loop:
 270         for (i=0; i<N; i++){
 271           a[i] = b[i] + c[i];
 272         }
 273
 274    vectorized loop:
 275         for (i=0; i<N; i+=VF){
 276           a[i:VF] = b[i:VF] + c[i:VF];
 277         }
 278 */
 279
 280 static opt_result
 281 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 282 {
 283   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 284   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 285   unsigned nbbs = loop->num_nodes;
 286   poly_uint64 vectorization_factor = 1;
 287   tree scalar_type = NULL_TREE;
 288   gphi *phi;
 289   tree vectype;
 290   stmt_vec_info stmt_info;
 291   unsigned i;
 292
 293   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 294
 295   for (i = 0; i < nbbs; i++)
 296     {
 297       basic_block bb = bbs[i];
 298
 299       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 300            gsi_next (&si))
 301         {
 302           phi = si.phi ();
 303           stmt_info = loop_vinfo->lookup_stmt (phi);
 304           if (dump_enabled_p ())
 305             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 306                              phi);
 307
 308           gcc_assert (stmt_info);
 309
 310           if (STMT_VINFO_RELEVANT_P (stmt_info)
 311               || STMT_VINFO_LIVE_P (stmt_info))
 312             {
 313               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 314               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 315
 316               if (dump_enabled_p ())
 317                 dump_printf_loc (MSG_NOTE, vect_location,
 318                                  "get vectype for scalar type:  %T\n",
 319                                  scalar_type);
 320
 321               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 322               if (!vectype)
 323                 return opt_result::failure_at (phi,
 324                                                "not vectorized: unsupported "
 325                                                "data-type %T\n",
 326                                                scalar_type);
 327               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 328
 329               if (dump_enabled_p ())
 330                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 331                                  vectype);
 332
 333               if (dump_enabled_p ())
 334                 {
 335                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 336                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 337                   dump_printf (MSG_NOTE, "\n");
 338                 }
 339
 340               vect_update_max_nunits (&vectorization_factor, vectype);
 341             }
 342         }
 343
 344       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 345            gsi_next (&si))
 346         {
 347           if (is_gimple_debug (gsi_stmt (si)))
 348             continue;
 349           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 350           opt_result res
 351             = vect_determine_vf_for_stmt (loop_vinfo,
 352                                           stmt_info, &vectorization_factor);
 353           if (!res)
 354             return res;
 355         }
 356     }
 357
 358   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 359   if (dump_enabled_p ())
 360     {
 361       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 362       dump_dec (MSG_NOTE, vectorization_factor);
 363       dump_printf (MSG_NOTE, "\n");
 364     }
 365
 366   if (known_le (vectorization_factor, 1U))
 367     return opt_result::failure_at (vect_location,
 368                                    "not vectorized: unsupported data-type\n");
 369   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 370   return opt_result::success ();
 371 }
 372
 373
 374 /* Function vect_is_simple_iv_evolution.
 375
 376    FORNOW: A simple evolution of an induction variables in the loop is
 377    considered a polynomial evolution.  */
 378
 379 static bool
 380 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 381                              tree * step)
 382 {
 383   tree init_expr;
 384   tree step_expr;
 385   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 386   basic_block bb;
 387
 388   /* When there is no evolution in this loop, the evolution function
 389      is not "simple".  */
 390   if (evolution_part == NULL_TREE)
 391     return false;
 392
 393   /* When the evolution is a polynomial of degree >= 2
 394      the evolution function is not "simple".  */
 395   if (tree_is_chrec (evolution_part))
 396     return false;
 397
 398   step_expr = evolution_part;
 399   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 400
 401   if (dump_enabled_p ())
 402     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 403                      step_expr, init_expr);
 404
 405   *init = init_expr;
 406   *step = step_expr;
 407
 408   if (TREE_CODE (step_expr) != INTEGER_CST
 409       && (TREE_CODE (step_expr) != SSA_NAME
 410           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 411               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 412           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 413               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 414                   || !flag_associative_math)))
 415       && (TREE_CODE (step_expr) != REAL_CST
 416           || !flag_associative_math))
 417     {
 418       if (dump_enabled_p ())
 419         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 420                          "step unknown.\n");
 421       return false;
 422     }
 423
 424   return true;
 425 }
 426
 427 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 428    what we are assuming is a double reduction.  For example, given
 429    a structure like this:
 430
 431       outer1:
 432         x_1 = PHI <x_4(outer2), ...>;
 433         ...
 434
 435       inner:
 436         x_2 = PHI <x_1(outer1), ...>;
 437         ...
 438         x_3 = ...;
 439         ...
 440
 441       outer2:
 442         x_4 = PHI <x_3(inner)>;
 443         ...
 444
 445    outer loop analysis would treat x_1 as a double reduction phi and
 446    this function would then return true for x_2.  */
 447
 448 static bool
 449 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 450 {
 451   use_operand_p use_p;
 452   ssa_op_iter op_iter;
 453   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 454     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 455       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 456         return true;
 457   return false;
 458 }
 459
 460 /* Function vect_analyze_scalar_cycles_1.
 461
 462    Examine the cross iteration def-use cycles of scalar variables
 463    in LOOP.  LOOP_VINFO represents the loop that is now being
 464    considered for vectorization (can be LOOP, or an outer-loop
 465    enclosing LOOP).  */
 466
 467 static void
 468 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 469 {
 470   basic_block bb = loop->header;
 471   tree init, step;
 472   auto_vec<stmt_vec_info, 64> worklist;
 473   gphi_iterator gsi;
 474   bool double_reduc, reduc_chain;
 475
 476   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 477
 478   /* First - identify all inductions.  Reduction detection assumes that all the
 479      inductions have been identified, therefore, this order must not be
 480      changed.  */
 481   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 482     {
 483       gphi *phi = gsi.phi ();
 484       tree access_fn = NULL;
 485       tree def = PHI_RESULT (phi);
 486       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 487
 488       if (dump_enabled_p ())
 489         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 490
 491       /* Skip virtual phi's.  The data dependences that are associated with
 492          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 493       if (virtual_operand_p (def))
 494         continue;
 495
 496       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 497
 498       /* Analyze the evolution function.  */
 499       access_fn = analyze_scalar_evolution (loop, def);
 500       if (access_fn)
 501         {
 502           STRIP_NOPS (access_fn);
 503           if (dump_enabled_p ())
 504             dump_printf_loc (MSG_NOTE, vect_location,
 505                              "Access function of PHI: %T\n", access_fn);
 506           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 507             = initial_condition_in_loop_num (access_fn, loop->num);
 508           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 509             = evolution_part_in_loop_num (access_fn, loop->num);
 510         }
 511
 512       if (!access_fn
 513           || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 514           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 515           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 516               && TREE_CODE (step) != INTEGER_CST))
 517         {
 518           worklist.safe_push (stmt_vinfo);
 519           continue;
 520         }
 521
 522       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 523                   != NULL_TREE);
 524       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 525
 526       if (dump_enabled_p ())
 527         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 528       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 529     }
 530
 531
 532   /* Second - identify all reductions and nested cycles.  */
 533   while (worklist.length () > 0)
 534     {
 535       stmt_vec_info stmt_vinfo = worklist.pop ();
 536       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 537       tree def = PHI_RESULT (phi);
 538
 539       if (dump_enabled_p ())
 540         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 541
 542       gcc_assert (!virtual_operand_p (def)
 543                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 544
 545       stmt_vec_info reduc_stmt_info
 546         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 547                                     &reduc_chain);
 548       if (reduc_stmt_info)
 549         {
 550           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 551           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 552           if (double_reduc)
 553             {
 554               if (dump_enabled_p ())
 555                 dump_printf_loc (MSG_NOTE, vect_location,
 556                                  "Detected double reduction.\n");
 557
 558               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 559               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 560             }
 561           else
 562             {
 563               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 564                 {
 565                   if (dump_enabled_p ())
 566                     dump_printf_loc (MSG_NOTE, vect_location,
 567                                      "Detected vectorizable nested cycle.\n");
 568
 569                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 570                 }
 571               else
 572                 {
 573                   if (dump_enabled_p ())
 574                     dump_printf_loc (MSG_NOTE, vect_location,
 575                                      "Detected reduction.\n");
 576
 577                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 578                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 579                   /* Store the reduction cycles for possible vectorization in
 580                      loop-aware SLP if it was not detected as reduction
 581                      chain.  */
 582                   if (! reduc_chain)
 583                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 584                       (reduc_stmt_info);
 585                 }
 586             }
 587         }
 588       else
 589         if (dump_enabled_p ())
 590           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 591                            "Unknown def-use cycle pattern.\n");
 592     }
 593 }
 594
 595
 596 /* Function vect_analyze_scalar_cycles.
 597
 598    Examine the cross iteration def-use cycles of scalar variables, by
 599    analyzing the loop-header PHIs of scalar variables.  Classify each
 600    cycle as one of the following: invariant, induction, reduction, unknown.
 601    We do that for the loop represented by LOOP_VINFO, and also to its
 602    inner-loop, if exists.
 603    Examples for scalar cycles:
 604
 605    Example1: reduction:
 606
 607               loop1:
 608               for (i=0; i<N; i++)
 609                  sum += a[i];
 610
 611    Example2: induction:
 612
 613               loop2:
 614               for (i=0; i<N; i++)
 615                  a[i] = i;  */
 616
 617 static void
 618 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 619 {
 620   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 621
 622   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 623
 624   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 625      Reductions in such inner-loop therefore have different properties than
 626      the reductions in the nest that gets vectorized:
 627      1. When vectorized, they are executed in the same order as in the original
 628         scalar loop, so we can't change the order of computation when
 629         vectorizing them.
 630      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 631         current checks are too strict.  */
 632
 633   if (loop->inner)
 634     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 635 }
 636
 637 /* Transfer group and reduction information from STMT_INFO to its
 638    pattern stmt.  */
 639
 640 static void
 641 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 642 {
 643   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 644   stmt_vec_info stmtp;
 645   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 646               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 647   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 648   do
 649     {
 650       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 651       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 652                            == STMT_VINFO_DEF_TYPE (stmt_info));
 653       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 654       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 655       if (stmt_info)
 656         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 657           = STMT_VINFO_RELATED_STMT (stmt_info);
 658     }
 659   while (stmt_info);
 660 }
 661
 662 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 663
 664 static void
 665 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 666 {
 667   stmt_vec_info first;
 668   unsigned i;
 669
 670   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 671     {
 672       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 673       while (next)
 674         {
 675           if ((STMT_VINFO_IN_PATTERN_P (next)
 676                != STMT_VINFO_IN_PATTERN_P (first))
 677               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 678             break;
 679           next = REDUC_GROUP_NEXT_ELEMENT (next);
 680         }
 681       /* If all reduction chain members are well-formed patterns adjust
 682          the group to group the pattern stmts instead.  */
 683       if (! next
 684           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 685         {
 686           if (STMT_VINFO_IN_PATTERN_P (first))
 687             {
 688               vect_fixup_reduc_chain (first);
 689               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 690                 = STMT_VINFO_RELATED_STMT (first);
 691             }
 692         }
 693       /* If not all stmt in the chain are patterns or if we failed
 694          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 695          it as regular reduction instead.  */
 696       else
 697         {
 698           stmt_vec_info vinfo = first;
 699           stmt_vec_info last = NULL;
 700           while (vinfo)
 701             {
 702               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 703               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 704               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 705               last = vinfo;
 706               vinfo = next;
 707             }
 708           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 709             = vect_internal_def;
 710           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 711           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 712           --i;
 713         }
 714     }
 715 }
 716
 717 /* Function vect_get_loop_niters.
 718
 719    Determine how many iterations the loop is executed and place it
 720    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 721    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 722    niter information holds in ASSUMPTIONS.
 723
 724    Return the loop exit condition.  */
 725
 726
 727 static gcond *
 728 vect_get_loop_niters (class loop *loop, tree *assumptions,
 729                       tree *number_of_iterations, tree *number_of_iterationsm1)
 730 {
 731   edge exit = single_exit (loop);
 732   class tree_niter_desc niter_desc;
 733   tree niter_assumptions, niter, may_be_zero;
 734   gcond *cond = get_loop_exit_condition (loop);
 735
 736   *assumptions = boolean_true_node;
 737   *number_of_iterationsm1 = chrec_dont_know;
 738   *number_of_iterations = chrec_dont_know;
 739   DUMP_VECT_SCOPE ("get_loop_niters");
 740
 741   if (!exit)
 742     return cond;
 743
 744   may_be_zero = NULL_TREE;
 745   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 746       || chrec_contains_undetermined (niter_desc.niter))
 747     return cond;
 748
 749   niter_assumptions = niter_desc.assumptions;
 750   may_be_zero = niter_desc.may_be_zero;
 751   niter = niter_desc.niter;
 752
 753   if (may_be_zero && integer_zerop (may_be_zero))
 754     may_be_zero = NULL_TREE;
 755
 756   if (may_be_zero)
 757     {
 758       if (COMPARISON_CLASS_P (may_be_zero))
 759         {
 760           /* Try to combine may_be_zero with assumptions, this can simplify
 761              computation of niter expression.  */
 762           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 763             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 764                                              niter_assumptions,
 765                                              fold_build1 (TRUTH_NOT_EXPR,
 766                                                           boolean_type_node,
 767                                                           may_be_zero));
 768           else
 769             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 770                                  build_int_cst (TREE_TYPE (niter), 0),
 771                                  rewrite_to_non_trapping_overflow (niter));
 772
 773           may_be_zero = NULL_TREE;
 774         }
 775       else if (integer_nonzerop (may_be_zero))
 776         {
 777           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 778           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 779           return cond;
 780         }
 781       else
 782         return cond;
 783     }
 784
 785   *assumptions = niter_assumptions;
 786   *number_of_iterationsm1 = niter;
 787
 788   /* We want the number of loop header executions which is the number
 789      of latch executions plus one.
 790      ???  For UINT_MAX latch executions this number overflows to zero
 791      for loops like do { n++; } while (n != 0);  */
 792   if (niter && !chrec_contains_undetermined (niter))
 793     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 794                           build_int_cst (TREE_TYPE (niter), 1));
 795   *number_of_iterations = niter;
 796
 797   return cond;
 798 }
 799
 800 /* Function bb_in_loop_p
 801
 802    Used as predicate for dfs order traversal of the loop bbs.  */
 803
 804 static bool
 805 bb_in_loop_p (const_basic_block bb, const void *data)
 806 {
 807   const class loop *const loop = (const class loop *)data;
 808   if (flow_bb_inside_loop_p (loop, bb))
 809     return true;
 810   return false;
 811 }
 812
 813
 814 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 815    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 816
 817 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 818   : vec_info (vec_info::loop, shared),
 819     loop (loop_in),
 820     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 821     num_itersm1 (NULL_TREE),
 822     num_iters (NULL_TREE),
 823     num_iters_unchanged (NULL_TREE),
 824     num_iters_assumptions (NULL_TREE),
 825     vector_costs (nullptr),
 826     scalar_costs (nullptr),
 827     th (0),
 828     versioning_threshold (0),
 829     vectorization_factor (0),
 830     main_loop_edge (nullptr),
 831     skip_main_loop_edge (nullptr),
 832     skip_this_loop_edge (nullptr),
 833     reusable_accumulators (),
 834     max_vectorization_factor (0),
 835     mask_skip_niters (NULL_TREE),
 836     rgroup_compare_type (NULL_TREE),
 837     simd_if_cond (NULL_TREE),
 838     unaligned_dr (NULL),
 839     peeling_for_alignment (0),
 840     ptr_mask (0),
 841     ivexpr_map (NULL),
 842     scan_map (NULL),
 843     slp_unrolling_factor (1),
 844     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 845     vectorizable (false),
 846     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 847     using_partial_vectors_p (false),
 848     epil_using_partial_vectors_p (false),
 849     peeling_for_gaps (false),
 850     peeling_for_niter (false),
 851     no_data_dependencies (false),
 852     has_mask_store (false),
 853     scalar_loop_scaling (profile_probability::uninitialized ()),
 854     scalar_loop (NULL),
 855     orig_loop_info (NULL)
 856 {
 857   /* CHECKME: We want to visit all BBs before their successors (except for
 858      latch blocks, for which this assertion wouldn't hold).  In the simple
 859      case of the loop forms we allow, a dfs order of the BBs would the same
 860      as reversed postorder traversal, so we are safe.  */
 861
 862   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 863                                           bbs, loop->num_nodes, loop);
 864   gcc_assert (nbbs == loop->num_nodes);
 865
 866   for (unsigned int i = 0; i < nbbs; i++)
 867     {
 868       basic_block bb = bbs[i];
 869       gimple_stmt_iterator si;
 870
 871       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 872         {
 873           gimple *phi = gsi_stmt (si);
 874           gimple_set_uid (phi, 0);
 875           add_stmt (phi);
 876         }
 877
 878       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 879         {
 880           gimple *stmt = gsi_stmt (si);
 881           gimple_set_uid (stmt, 0);
 882           if (is_gimple_debug (stmt))
 883             continue;
 884           add_stmt (stmt);
 885           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 886              third argument is the #pragma omp simd if (x) condition, when 0,
 887              loop shouldn't be vectorized, when non-zero constant, it should
 888              be vectorized normally, otherwise versioned with vectorized loop
 889              done if the condition is non-zero at runtime.  */
 890           if (loop_in->simduid
 891               && is_gimple_call (stmt)
 892               && gimple_call_internal_p (stmt)
 893               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 894               && gimple_call_num_args (stmt) >= 3
 895               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 896               && (loop_in->simduid
 897                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 898             {
 899               tree arg = gimple_call_arg (stmt, 2);
 900               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 901                 simd_if_cond = arg;
 902               else
 903                 gcc_assert (integer_nonzerop (arg));
 904             }
 905         }
 906     }
 907
 908   epilogue_vinfos.create (6);
 909 }
 910
 911 /* Free all levels of rgroup CONTROLS.  */
 912
 913 void
 914 release_vec_loop_controls (vec<rgroup_controls> *controls)
 915 {
 916   rgroup_controls *rgc;
 917   unsigned int i;
 918   FOR_EACH_VEC_ELT (*controls, i, rgc)
 919     rgc->controls.release ();
 920   controls->release ();
 921 }
 922
 923 /* Free all memory used by the _loop_vec_info, as well as all the
 924    stmt_vec_info structs of all the stmts in the loop.  */
 925
 926 _loop_vec_info::~_loop_vec_info ()
 927 {
 928   free (bbs);
 929
 930   release_vec_loop_controls (&masks);
 931   release_vec_loop_controls (&lens);
 932   delete ivexpr_map;
 933   delete scan_map;
 934   epilogue_vinfos.release ();
 935   delete scalar_costs;
 936   delete vector_costs;
 937
 938   /* When we release an epiloge vinfo that we do not intend to use
 939      avoid clearing AUX of the main loop which should continue to
 940      point to the main loop vinfo since otherwise we'll leak that.  */
 941   if (loop->aux == this)
 942     loop->aux = NULL;
 943 }
 944
 945 /* Return an invariant or register for EXPR and emit necessary
 946    computations in the LOOP_VINFO loop preheader.  */
 947
 948 tree
 949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 950 {
 951   if (is_gimple_reg (expr)
 952       || is_gimple_min_invariant (expr))
 953     return expr;
 954
 955   if (! loop_vinfo->ivexpr_map)
 956     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 957   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 958   if (! cached)
 959     {
 960       gimple_seq stmts = NULL;
 961       cached = force_gimple_operand (unshare_expr (expr),
 962                                      &stmts, true, NULL_TREE);
 963       if (stmts)
 964         {
 965           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 966           gsi_insert_seq_on_edge_immediate (e, stmts);
 967         }
 968     }
 969   return cached;
 970 }
 971
 972 /* Return true if we can use CMP_TYPE as the comparison type to produce
 973    all masks required to mask LOOP_VINFO.  */
 974
 975 static bool
 976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 977 {
 978   rgroup_controls *rgm;
 979   unsigned int i;
 980   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 981     if (rgm->type != NULL_TREE
 982         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 983                                             cmp_type, rgm->type,
 984                                             OPTIMIZE_FOR_SPEED))
 985       return false;
 986   return true;
 987 }
 988
 989 /* Calculate the maximum number of scalars per iteration for every
 990    rgroup in LOOP_VINFO.  */
 991
 992 static unsigned int
 993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 994 {
 995   unsigned int res = 1;
 996   unsigned int i;
 997   rgroup_controls *rgm;
 998   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 999     res = MAX (res, rgm->max_nscalars_per_iter);
1000   return res;
1001 }
1002
1003 /* Calculate the minimum precision necessary to represent:
1004
1005       MAX_NITERS * FACTOR
1006
1007    as an unsigned integer, where MAX_NITERS is the maximum number of
1008    loop header iterations for the original scalar form of LOOP_VINFO.  */
1009
1010 static unsigned
1011 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1012 {
1013   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1014
1015   /* Get the maximum number of iterations that is representable
1016      in the counter type.  */
1017   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1018   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1019
1020   /* Get a more refined estimate for the number of iterations.  */
1021   widest_int max_back_edges;
1022   if (max_loop_iterations (loop, &max_back_edges))
1023     max_ni = wi::smin (max_ni, max_back_edges + 1);
1024
1025   /* Work out how many bits we need to represent the limit.  */
1026   return wi::min_precision (max_ni * factor, UNSIGNED);
1027 }
1028
1029 /* True if the loop needs peeling or partial vectors when vectorized.  */
1030
1031 static bool
1032 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1033 {
1034   unsigned HOST_WIDE_INT const_vf;
1035   HOST_WIDE_INT max_niter
1036     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1037
1038   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1039   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1040     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1041                                           (loop_vinfo));
1042
1043   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1044       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1045     {
1046       /* Work out the (constant) number of iterations that need to be
1047          peeled for reasons other than niters.  */
1048       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1049       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1050         peel_niter += 1;
1051       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1052                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1053         return true;
1054     }
1055   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1056       /* ??? When peeling for gaps but not alignment, we could
1057          try to check whether the (variable) niters is known to be
1058          VF * N + 1.  That's something of a niche case though.  */
1059       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1060       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1061       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1062            < (unsigned) exact_log2 (const_vf))
1063           /* In case of versioning, check if the maximum number of
1064              iterations is greater than th.  If they are identical,
1065              the epilogue is unnecessary.  */
1066           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1067               || ((unsigned HOST_WIDE_INT) max_niter
1068                   > (th / const_vf) * const_vf))))
1069     return true;
1070
1071   return false;
1072 }
1073
1074 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1075    whether we can actually generate the masks required.  Return true if so,
1076    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1077
1078 static bool
1079 vect_verify_full_masking (loop_vec_info loop_vinfo)
1080 {
1081   unsigned int min_ni_width;
1082   unsigned int max_nscalars_per_iter
1083     = vect_get_max_nscalars_per_iter (loop_vinfo);
1084
1085   /* Use a normal loop if there are no statements that need masking.
1086      This only happens in rare degenerate cases: it means that the loop
1087      has no loads, no stores, and no live-out values.  */
1088   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1089     return false;
1090
1091   /* Work out how many bits we need to represent the limit.  */
1092   min_ni_width
1093     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1094
1095   /* Find a scalar mode for which WHILE_ULT is supported.  */
1096   opt_scalar_int_mode cmp_mode_iter;
1097   tree cmp_type = NULL_TREE;
1098   tree iv_type = NULL_TREE;
1099   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1100   unsigned int iv_precision = UINT_MAX;
1101
1102   if (iv_limit != -1)
1103     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1104                                       UNSIGNED);
1105
1106   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1107     {
1108       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1109       if (cmp_bits >= min_ni_width
1110           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1111         {
1112           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1113           if (this_type
1114               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1115             {
1116               /* Although we could stop as soon as we find a valid mode,
1117                  there are at least two reasons why that's not always the
1118                  best choice:
1119
1120                  - An IV that's Pmode or wider is more likely to be reusable
1121                    in address calculations than an IV that's narrower than
1122                    Pmode.
1123
1124                  - Doing the comparison in IV_PRECISION or wider allows
1125                    a natural 0-based IV, whereas using a narrower comparison
1126                    type requires mitigations against wrap-around.
1127
1128                  Conversely, if the IV limit is variable, doing the comparison
1129                  in a wider type than the original type can introduce
1130                  unnecessary extensions, so picking the widest valid mode
1131                  is not always a good choice either.
1132
1133                  Here we prefer the first IV type that's Pmode or wider,
1134                  and the first comparison type that's IV_PRECISION or wider.
1135                  (The comparison type must be no wider than the IV type,
1136                  to avoid extensions in the vector loop.)
1137
1138                  ??? We might want to try continuing beyond Pmode for ILP32
1139                  targets if CMP_BITS < IV_PRECISION.  */
1140               iv_type = this_type;
1141               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1142                 cmp_type = this_type;
1143               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1144                 break;
1145             }
1146         }
1147     }
1148
1149   if (!cmp_type)
1150     return false;
1151
1152   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1153   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1154   return true;
1155 }
1156
1157 /* Check whether we can use vector access with length based on precison
1158    comparison.  So far, to keep it simple, we only allow the case that the
1159    precision of the target supported length is larger than the precision
1160    required by loop niters.  */
1161
1162 static bool
1163 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1164 {
1165   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1166     return false;
1167
1168   unsigned int max_nitems_per_iter = 1;
1169   unsigned int i;
1170   rgroup_controls *rgl;
1171   /* Find the maximum number of items per iteration for every rgroup.  */
1172   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1173     {
1174       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1175       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1176     }
1177
1178   /* Work out how many bits we need to represent the length limit.  */
1179   unsigned int min_ni_prec
1180     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1181
1182   /* Now use the maximum of below precisions for one suitable IV type:
1183      - the IV's natural precision
1184      - the precision needed to hold: the maximum number of scalar
1185        iterations multiplied by the scale factor (min_ni_prec above)
1186      - the Pmode precision
1187
1188      If min_ni_prec is less than the precision of the current niters,
1189      we perfer to still use the niters type.  Prefer to use Pmode and
1190      wider IV to avoid narrow conversions.  */
1191
1192   unsigned int ni_prec
1193     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1194   min_ni_prec = MAX (min_ni_prec, ni_prec);
1195   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1196
1197   tree iv_type = NULL_TREE;
1198   opt_scalar_int_mode tmode_iter;
1199   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1200     {
1201       scalar_mode tmode = tmode_iter.require ();
1202       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1203
1204       /* ??? Do we really want to construct one IV whose precision exceeds
1205          BITS_PER_WORD?  */
1206       if (tbits > BITS_PER_WORD)
1207         break;
1208
1209       /* Find the first available standard integral type.  */
1210       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1211         {
1212           iv_type = build_nonstandard_integer_type (tbits, true);
1213           break;
1214         }
1215     }
1216
1217   if (!iv_type)
1218     {
1219       if (dump_enabled_p ())
1220         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1221                          "can't vectorize with length-based partial vectors"
1222                          " because there is no suitable iv type.\n");
1223       return false;
1224     }
1225
1226   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1227   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1228
1229   return true;
1230 }
1231
1232 /* Calculate the cost of one scalar iteration of the loop.  */
1233 static void
1234 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1235 {
1236   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1237   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1238   int nbbs = loop->num_nodes, factor;
1239   int innerloop_iters, i;
1240
1241   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1242
1243   /* Gather costs for statements in the scalar loop.  */
1244
1245   /* FORNOW.  */
1246   innerloop_iters = 1;
1247   if (loop->inner)
1248     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1249
1250   for (i = 0; i < nbbs; i++)
1251     {
1252       gimple_stmt_iterator si;
1253       basic_block bb = bbs[i];
1254
1255       if (bb->loop_father == loop->inner)
1256         factor = innerloop_iters;
1257       else
1258         factor = 1;
1259
1260       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1261         {
1262           gimple *stmt = gsi_stmt (si);
1263           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1264
1265           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1266             continue;
1267
1268           /* Skip stmts that are not vectorized inside the loop.  */
1269           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1270           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1271               && (!STMT_VINFO_LIVE_P (vstmt_info)
1272                   || !VECTORIZABLE_CYCLE_DEF
1273                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1274             continue;
1275
1276           vect_cost_for_stmt kind;
1277           if (STMT_VINFO_DATA_REF (stmt_info))
1278             {
1279               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1280                kind = scalar_load;
1281              else
1282                kind = scalar_store;
1283             }
1284           else if (vect_nop_conversion_p (stmt_info))
1285             continue;
1286           else
1287             kind = scalar_stmt;
1288
1289           /* We are using vect_prologue here to avoid scaling twice
1290              by the inner loop factor.  */
1291           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1292                             factor, kind, stmt_info, 0, vect_prologue);
1293         }
1294     }
1295
1296   /* Now accumulate cost.  */
1297   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1298   stmt_info_for_cost *si;
1299   int j;
1300   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1301                     j, si)
1302     (void) add_stmt_cost (loop_vinfo->scalar_costs, si->count,
1303                           si->kind, si->stmt_info, si->vectype,
1304                           si->misalign, si->where);
1305   loop_vinfo->scalar_costs->finish_cost (nullptr);
1306 }
1307
1308
1309 /* Function vect_analyze_loop_form.
1310
1311    Verify that certain CFG restrictions hold, including:
1312    - the loop has a pre-header
1313    - the loop has a single entry and exit
1314    - the loop exit condition is simple enough
1315    - the number of iterations can be analyzed, i.e, a countable loop.  The
1316      niter could be analyzed under some assumptions.  */
1317
1318 opt_result
1319 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1320 {
1321   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1322
1323   /* Different restrictions apply when we are considering an inner-most loop,
1324      vs. an outer (nested) loop.
1325      (FORNOW. May want to relax some of these restrictions in the future).  */
1326
1327   info->inner_loop_cond = NULL;
1328   if (!loop->inner)
1329     {
1330       /* Inner-most loop.  We currently require that the number of BBs is
1331          exactly 2 (the header and latch).  Vectorizable inner-most loops
1332          look like this:
1333
1334                         (pre-header)
1335                            |
1336                           header <--------+
1337                            | |            |
1338                            | +--> latch --+
1339                            |
1340                         (exit-bb)  */
1341
1342       if (loop->num_nodes != 2)
1343         return opt_result::failure_at (vect_location,
1344                                        "not vectorized:"
1345                                        " control flow in loop.\n");
1346
1347       if (empty_block_p (loop->header))
1348         return opt_result::failure_at (vect_location,
1349                                        "not vectorized: empty loop.\n");
1350     }
1351   else
1352     {
1353       class loop *innerloop = loop->inner;
1354       edge entryedge;
1355
1356       /* Nested loop. We currently require that the loop is doubly-nested,
1357          contains a single inner loop, and the number of BBs is exactly 5.
1358          Vectorizable outer-loops look like this:
1359
1360                         (pre-header)
1361                            |
1362                           header <---+
1363                            |         |
1364                           inner-loop |
1365                            |         |
1366                           tail ------+
1367                            |
1368                         (exit-bb)
1369
1370          The inner-loop has the properties expected of inner-most loops
1371          as described above.  */
1372
1373       if ((loop->inner)->inner || (loop->inner)->next)
1374         return opt_result::failure_at (vect_location,
1375                                        "not vectorized:"
1376                                        " multiple nested loops.\n");
1377
1378       if (loop->num_nodes != 5)
1379         return opt_result::failure_at (vect_location,
1380                                        "not vectorized:"
1381                                        " control flow in loop.\n");
1382
1383       entryedge = loop_preheader_edge (innerloop);
1384       if (entryedge->src != loop->header
1385           || !single_exit (innerloop)
1386           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1387         return opt_result::failure_at (vect_location,
1388                                        "not vectorized:"
1389                                        " unsupported outerloop form.\n");
1390
1391       /* Analyze the inner-loop.  */
1392       vect_loop_form_info inner;
1393       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1394       if (!res)
1395         {
1396           if (dump_enabled_p ())
1397             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398                              "not vectorized: Bad inner loop.\n");
1399           return res;
1400         }
1401
1402       /* Don't support analyzing niter under assumptions for inner
1403          loop.  */
1404       if (!integer_onep (inner.assumptions))
1405         return opt_result::failure_at (vect_location,
1406                                        "not vectorized: Bad inner loop.\n");
1407
1408       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1409         return opt_result::failure_at (vect_location,
1410                                        "not vectorized: inner-loop count not"
1411                                        " invariant.\n");
1412
1413       if (dump_enabled_p ())
1414         dump_printf_loc (MSG_NOTE, vect_location,
1415                          "Considering outer-loop vectorization.\n");
1416       info->inner_loop_cond = inner.loop_cond;
1417     }
1418
1419   if (!single_exit (loop))
1420     return opt_result::failure_at (vect_location,
1421                                    "not vectorized: multiple exits.\n");
1422   if (EDGE_COUNT (loop->header->preds) != 2)
1423     return opt_result::failure_at (vect_location,
1424                                    "not vectorized:"
1425                                    " too many incoming edges.\n");
1426
1427   /* We assume that the loop exit condition is at the end of the loop. i.e,
1428      that the loop is represented as a do-while (with a proper if-guard
1429      before the loop if needed), where the loop header contains all the
1430      executable statements, and the latch is empty.  */
1431   if (!empty_block_p (loop->latch)
1432       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1433     return opt_result::failure_at (vect_location,
1434                                    "not vectorized: latch block not empty.\n");
1435
1436   /* Make sure the exit is not abnormal.  */
1437   edge e = single_exit (loop);
1438   if (e->flags & EDGE_ABNORMAL)
1439     return opt_result::failure_at (vect_location,
1440                                    "not vectorized:"
1441                                    " abnormal loop exit edge.\n");
1442
1443   info->loop_cond
1444     = vect_get_loop_niters (loop, &info->assumptions,
1445                             &info->number_of_iterations,
1446                             &info->number_of_iterationsm1);
1447   if (!info->loop_cond)
1448     return opt_result::failure_at
1449       (vect_location,
1450        "not vectorized: complicated exit condition.\n");
1451
1452   if (integer_zerop (info->assumptions)
1453       || !info->number_of_iterations
1454       || chrec_contains_undetermined (info->number_of_iterations))
1455     return opt_result::failure_at
1456       (info->loop_cond,
1457        "not vectorized: number of iterations cannot be computed.\n");
1458
1459   if (integer_zerop (info->number_of_iterations))
1460     return opt_result::failure_at
1461       (info->loop_cond,
1462        "not vectorized: number of iterations = 0.\n");
1463
1464   if (!(tree_fits_shwi_p (info->number_of_iterations)
1465         && tree_to_shwi (info->number_of_iterations) > 0))
1466     {
1467       if (dump_enabled_p ())
1468         {
1469           dump_printf_loc (MSG_NOTE, vect_location,
1470                            "Symbolic number of iterations is ");
1471           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1472           dump_printf (MSG_NOTE, "\n");
1473         }
1474     }
1475
1476   return opt_result::success ();
1477 }
1478
1479 /* Create a loop_vec_info for LOOP with SHARED and the
1480    vect_analyze_loop_form result.  */
1481
1482 loop_vec_info
1483 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1484                         const vect_loop_form_info *info,
1485                         loop_vec_info main_loop_info)
1486 {
1487   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1488   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1489   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1490   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1491   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1492   /* Also record the assumptions for versioning.  */
1493   if (!integer_onep (info->assumptions) && !main_loop_info)
1494     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1495
1496   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1497   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1498   if (info->inner_loop_cond)
1499     {
1500       stmt_vec_info inner_loop_cond_info
1501         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1502       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503       /* If we have an estimate on the number of iterations of the inner
1504          loop use that to limit the scale for costing, otherwise use
1505          --param vect-inner-loop-cost-factor literally.  */
1506       widest_int nit;
1507       if (estimated_stmt_executions (loop->inner, &nit))
1508         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1509           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1510     }
1511
1512   return loop_vinfo;
1513 }
1514
1515
1516
1517 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1518    statements update the vectorization factor.  */
1519
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1522 {
1523   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525   int nbbs = loop->num_nodes;
1526   poly_uint64 vectorization_factor;
1527   int i;
1528
1529   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1530
1531   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532   gcc_assert (known_ne (vectorization_factor, 0U));
1533
1534   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1535      vectorization factor of the loop is the unrolling factor required by
1536      the SLP instances.  If that unrolling factor is 1, we say, that we
1537      perform pure SLP on loop - cross iteration parallelism is not
1538      exploited.  */
1539   bool only_slp_in_loop = true;
1540   for (i = 0; i < nbbs; i++)
1541     {
1542       basic_block bb = bbs[i];
1543       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544            gsi_next (&si))
1545         {
1546           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547           if (!stmt_info)
1548             continue;
1549           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551               && !PURE_SLP_STMT (stmt_info))
1552             /* STMT needs both SLP and loop-based vectorization.  */
1553             only_slp_in_loop = false;
1554         }
1555       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556            gsi_next (&si))
1557         {
1558           if (is_gimple_debug (gsi_stmt (si)))
1559             continue;
1560           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561           stmt_info = vect_stmt_to_vectorize (stmt_info);
1562           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564               && !PURE_SLP_STMT (stmt_info))
1565             /* STMT needs both SLP and loop-based vectorization.  */
1566             only_slp_in_loop = false;
1567         }
1568     }
1569
1570   if (only_slp_in_loop)
1571     {
1572       if (dump_enabled_p ())
1573         dump_printf_loc (MSG_NOTE, vect_location,
1574                          "Loop contains only SLP stmts\n");
1575       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1576     }
1577   else
1578     {
1579       if (dump_enabled_p ())
1580         dump_printf_loc (MSG_NOTE, vect_location,
1581                          "Loop contains SLP and non-SLP stmts\n");
1582       /* Both the vectorization factor and unroll factor have the form
1583          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584          so they must have a common multiple.  */
1585       vectorization_factor
1586         = force_common_multiple (vectorization_factor,
1587                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1588     }
1589
1590   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591   if (dump_enabled_p ())
1592     {
1593       dump_printf_loc (MSG_NOTE, vect_location,
1594                        "Updating vectorization factor to ");
1595       dump_dec (MSG_NOTE, vectorization_factor);
1596       dump_printf (MSG_NOTE, ".\n");
1597     }
1598 }
1599
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601    the other phi in the reduction is also relevant for vectorization.
1602    This rejects cases such as:
1603
1604       outer1:
1605         x_1 = PHI <x_3(outer2), ...>;
1606         ...
1607
1608       inner:
1609         x_2 = ...;
1610         ...
1611
1612       outer2:
1613         x_3 = PHI <x_2(inner)>;
1614
1615    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1616
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1619 {
1620   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621     return false;
1622
1623   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1624 }
1625
1626 /* Function vect_analyze_loop_operations.
1627
1628    Scan the loop stmts and make sure they are all vectorizable.  */
1629
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1632 {
1633   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635   int nbbs = loop->num_nodes;
1636   int i;
1637   stmt_vec_info stmt_info;
1638   bool need_to_vectorize = false;
1639   bool ok;
1640
1641   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1642
1643   auto_vec<stmt_info_for_cost> cost_vec;
1644
1645   for (i = 0; i < nbbs; i++)
1646     {
1647       basic_block bb = bbs[i];
1648
1649       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650            gsi_next (&si))
1651         {
1652           gphi *phi = si.phi ();
1653           ok = true;
1654
1655           stmt_info = loop_vinfo->lookup_stmt (phi);
1656           if (dump_enabled_p ())
1657             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658           if (virtual_operand_p (gimple_phi_result (phi)))
1659             continue;
1660
1661           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662              (i.e., a phi in the tail of the outer-loop).  */
1663           if (! is_loop_header_bb_p (bb))
1664             {
1665               /* FORNOW: we currently don't support the case that these phis
1666                  are not used in the outerloop (unless it is double reduction,
1667                  i.e., this phi is vect_reduction_def), cause this case
1668                  requires to actually do something here.  */
1669               if (STMT_VINFO_LIVE_P (stmt_info)
1670                   && !vect_active_double_reduction_p (stmt_info))
1671                 return opt_result::failure_at (phi,
1672                                                "Unsupported loop-closed phi"
1673                                                " in outer-loop.\n");
1674
1675               /* If PHI is used in the outer loop, we check that its operand
1676                  is defined in the inner loop.  */
1677               if (STMT_VINFO_RELEVANT_P (stmt_info))
1678                 {
1679                   tree phi_op;
1680
1681                   if (gimple_phi_num_args (phi) != 1)
1682                     return opt_result::failure_at (phi, "unsupported phi");
1683
1684                   phi_op = PHI_ARG_DEF (phi, 0);
1685                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686                   if (!op_def_info)
1687                     return opt_result::failure_at (phi, "unsupported phi\n");
1688
1689                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690                       && (STMT_VINFO_RELEVANT (op_def_info)
1691                           != vect_used_in_outer_by_reduction))
1692                     return opt_result::failure_at (phi, "unsupported phi\n");
1693
1694                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1696                            == vect_double_reduction_def))
1697                       && !vectorizable_lc_phi (loop_vinfo,
1698                                                stmt_info, NULL, NULL))
1699                     return opt_result::failure_at (phi, "unsupported phi\n");
1700                 }
1701
1702               continue;
1703             }
1704
1705           gcc_assert (stmt_info);
1706
1707           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708                || STMT_VINFO_LIVE_P (stmt_info))
1709               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710             /* A scalar-dependence cycle that we don't support.  */
1711             return opt_result::failure_at (phi,
1712                                            "not vectorized:"
1713                                            " scalar dependence cycle.\n");
1714
1715           if (STMT_VINFO_RELEVANT_P (stmt_info))
1716             {
1717               need_to_vectorize = true;
1718               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719                   && ! PURE_SLP_STMT (stmt_info))
1720                 ok = vectorizable_induction (loop_vinfo,
1721                                              stmt_info, NULL, NULL,
1722                                              &cost_vec);
1723               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1725                             == vect_double_reduction_def)
1726                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727                        && ! PURE_SLP_STMT (stmt_info))
1728                 ok = vectorizable_reduction (loop_vinfo,
1729                                              stmt_info, NULL, NULL, &cost_vec);
1730             }
1731
1732           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1733           if (ok
1734               && STMT_VINFO_LIVE_P (stmt_info)
1735               && !PURE_SLP_STMT (stmt_info))
1736             ok = vectorizable_live_operation (loop_vinfo,
1737                                               stmt_info, NULL, NULL, NULL,
1738                                               -1, false, &cost_vec);
1739
1740           if (!ok)
1741             return opt_result::failure_at (phi,
1742                                            "not vectorized: relevant phi not "
1743                                            "supported: %G",
1744                                            static_cast <gimple *> (phi));
1745         }
1746
1747       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748            gsi_next (&si))
1749         {
1750           gimple *stmt = gsi_stmt (si);
1751           if (!gimple_clobber_p (stmt)
1752               && !is_gimple_debug (stmt))
1753             {
1754               opt_result res
1755                 = vect_analyze_stmt (loop_vinfo,
1756                                      loop_vinfo->lookup_stmt (stmt),
1757                                      &need_to_vectorize,
1758                                      NULL, NULL, &cost_vec);
1759               if (!res)
1760                 return res;
1761             }
1762         }
1763     } /* bbs */
1764
1765   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1766
1767   /* All operations in the loop are either irrelevant (deal with loop
1768      control, or dead), or only used outside the loop and can be moved
1769      out of the loop (e.g. invariants, inductions).  The loop can be
1770      optimized away by scalar optimizations.  We're better off not
1771      touching this loop.  */
1772   if (!need_to_vectorize)
1773     {
1774       if (dump_enabled_p ())
1775         dump_printf_loc (MSG_NOTE, vect_location,
1776                          "All the computation can be taken out of the loop.\n");
1777       return opt_result::failure_at
1778         (vect_location,
1779          "not vectorized: redundant loop. no profit to vectorize.\n");
1780     }
1781
1782   return opt_result::success ();
1783 }
1784
1785 /* Return true if we know that the iteration count is smaller than the
1786    vectorization factor.  Return false if it isn't, or if we can't be sure
1787    either way.  */
1788
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1791 {
1792   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793
1794   HOST_WIDE_INT max_niter;
1795   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797   else
1798     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1799
1800   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801     return true;
1802
1803   return false;
1804 }
1805
1806 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1807    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1808    definitely no, or -1 if it's worth retrying.  */
1809
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1812 {
1813   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815
1816   /* Only loops that can handle partially-populated vectors can have iteration
1817      counts less than the vectorization factor.  */
1818   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1819     {
1820       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1821         {
1822           if (dump_enabled_p ())
1823             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824                              "not vectorized: iteration count smaller than "
1825                              "vectorization factor.\n");
1826           return 0;
1827         }
1828     }
1829
1830   /* If using the "very cheap" model. reject cases in which we'd keep
1831      a copy of the scalar code (even if we might be able to vectorize it).  */
1832   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1833       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1834           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1835           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1836     {
1837       if (dump_enabled_p ())
1838         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839                          "some scalar iterations would need to be peeled\n");
1840       return 0;
1841     }
1842
1843   int min_profitable_iters, min_profitable_estimate;
1844   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1845                                       &min_profitable_estimate);
1846
1847   if (min_profitable_iters < 0)
1848     {
1849       if (dump_enabled_p ())
1850         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851                          "not vectorized: vectorization not profitable.\n");
1852       if (dump_enabled_p ())
1853         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854                          "not vectorized: vector version will never be "
1855                          "profitable.\n");
1856       return -1;
1857     }
1858
1859   int min_scalar_loop_bound = (param_min_vect_loop_bound
1860                                * assumed_vf);
1861
1862   /* Use the cost model only if it is more conservative than user specified
1863      threshold.  */
1864   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1865                                     min_profitable_iters);
1866
1867   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1868
1869   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1870       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1871     {
1872       if (dump_enabled_p ())
1873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874                          "not vectorized: vectorization not profitable.\n");
1875       if (dump_enabled_p ())
1876         dump_printf_loc (MSG_NOTE, vect_location,
1877                          "not vectorized: iteration count smaller than user "
1878                          "specified loop bound parameter or minimum profitable "
1879                          "iterations (whichever is more conservative).\n");
1880       return 0;
1881     }
1882
1883   /* The static profitablity threshold min_profitable_estimate includes
1884      the cost of having to check at runtime whether the scalar loop
1885      should be used instead.  If it turns out that we don't need or want
1886      such a check, the threshold we should use for the static estimate
1887      is simply the point at which the vector loop becomes more profitable
1888      than the scalar loop.  */
1889   if (min_profitable_estimate > min_profitable_iters
1890       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1891       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1892       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1893       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1894     {
1895       if (dump_enabled_p ())
1896         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1897                          " choice between the scalar and vector loops\n");
1898       min_profitable_estimate = min_profitable_iters;
1899     }
1900
1901   /* If the vector loop needs multiple iterations to be beneficial then
1902      things are probably too close to call, and the conservative thing
1903      would be to stick with the scalar code.  */
1904   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1905       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1906     {
1907       if (dump_enabled_p ())
1908         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909                          "one iteration of the vector loop would be"
1910                          " more expensive than the equivalent number of"
1911                          " iterations of the scalar loop\n");
1912       return 0;
1913     }
1914
1915   HOST_WIDE_INT estimated_niter;
1916
1917   /* If we are vectorizing an epilogue then we know the maximum number of
1918      scalar iterations it will cover is at least one lower than the
1919      vectorization factor of the main loop.  */
1920   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1921     estimated_niter
1922       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1923   else
1924     {
1925       estimated_niter = estimated_stmt_executions_int (loop);
1926       if (estimated_niter == -1)
1927         estimated_niter = likely_max_stmt_executions_int (loop);
1928     }
1929   if (estimated_niter != -1
1930       && ((unsigned HOST_WIDE_INT) estimated_niter
1931           < MAX (th, (unsigned) min_profitable_estimate)))
1932     {
1933       if (dump_enabled_p ())
1934         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1935                          "not vectorized: estimated iteration count too "
1936                          "small.\n");
1937       if (dump_enabled_p ())
1938         dump_printf_loc (MSG_NOTE, vect_location,
1939                          "not vectorized: estimated iteration count smaller "
1940                          "than specified loop bound parameter or minimum "
1941                          "profitable iterations (whichever is more "
1942                          "conservative).\n");
1943       return -1;
1944     }
1945
1946   return 1;
1947 }
1948
1949 static opt_result
1950 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1951                            vec<data_reference_p> *datarefs,
1952                            unsigned int *n_stmts)
1953 {
1954   *n_stmts = 0;
1955   for (unsigned i = 0; i < loop->num_nodes; i++)
1956     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1957          !gsi_end_p (gsi); gsi_next (&gsi))
1958       {
1959         gimple *stmt = gsi_stmt (gsi);
1960         if (is_gimple_debug (stmt))
1961           continue;
1962         ++(*n_stmts);
1963         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1964                                                         NULL, 0);
1965         if (!res)
1966           {
1967             if (is_gimple_call (stmt) && loop->safelen)
1968               {
1969                 tree fndecl = gimple_call_fndecl (stmt), op;
1970                 if (fndecl != NULL_TREE)
1971                   {
1972                     cgraph_node *node = cgraph_node::get (fndecl);
1973                     if (node != NULL && node->simd_clones != NULL)
1974                       {
1975                         unsigned int j, n = gimple_call_num_args (stmt);
1976                         for (j = 0; j < n; j++)
1977                           {
1978                             op = gimple_call_arg (stmt, j);
1979                             if (DECL_P (op)
1980                                 || (REFERENCE_CLASS_P (op)
1981                                     && get_base_address (op)))
1982                               break;
1983                           }
1984                         op = gimple_call_lhs (stmt);
1985                         /* Ignore #pragma omp declare simd functions
1986                            if they don't have data references in the
1987                            call stmt itself.  */
1988                         if (j == n
1989                             && !(op
1990                                  && (DECL_P (op)
1991                                      || (REFERENCE_CLASS_P (op)
1992                                          && get_base_address (op)))))
1993                           continue;
1994                       }
1995                   }
1996               }
1997             return res;
1998           }
1999         /* If dependence analysis will give up due to the limit on the
2000            number of datarefs stop here and fail fatally.  */
2001         if (datarefs->length ()
2002             > (unsigned)param_loop_max_datarefs_for_datadeps)
2003           return opt_result::failure_at (stmt, "exceeded param "
2004                                          "loop-max-datarefs-for-datadeps\n");
2005       }
2006   return opt_result::success ();
2007 }
2008
2009 /* Look for SLP-only access groups and turn each individual access into its own
2010    group.  */
2011 static void
2012 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2013 {
2014   unsigned int i;
2015   struct data_reference *dr;
2016
2017   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2018
2019   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2020   FOR_EACH_VEC_ELT (datarefs, i, dr)
2021     {
2022       gcc_assert (DR_REF (dr));
2023       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2024
2025       /* Check if the load is a part of an interleaving chain.  */
2026       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2027         {
2028           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2029           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2030           unsigned int group_size = DR_GROUP_SIZE (first_element);
2031
2032           /* Check if SLP-only groups.  */
2033           if (!STMT_SLP_TYPE (stmt_info)
2034               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2035             {
2036               /* Dissolve the group.  */
2037               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2038
2039               stmt_vec_info vinfo = first_element;
2040               while (vinfo)
2041                 {
2042                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2043                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2044                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2045                   DR_GROUP_SIZE (vinfo) = 1;
2046                   if (STMT_VINFO_STRIDED_P (first_element))
2047                     DR_GROUP_GAP (vinfo) = 0;
2048                   else
2049                     DR_GROUP_GAP (vinfo) = group_size - 1;
2050                   /* Duplicate and adjust alignment info, it needs to
2051                      be present on each group leader, see dr_misalignment.  */
2052                   if (vinfo != first_element)
2053                     {
2054                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2055                       dr_info2->target_alignment = dr_info->target_alignment;
2056                       int misalignment = dr_info->misalignment;
2057                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2058                         {
2059                           HOST_WIDE_INT diff
2060                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2061                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2062                           unsigned HOST_WIDE_INT align_c
2063                             = dr_info->target_alignment.to_constant ();
2064                           misalignment = (misalignment + diff) % align_c;
2065                         }
2066                       dr_info2->misalignment = misalignment;
2067                     }
2068                   vinfo = next;
2069                 }
2070             }
2071         }
2072     }
2073 }
2074
2075 /* Determine if operating on full vectors for LOOP_VINFO might leave
2076    some scalar iterations still to do.  If so, decide how we should
2077    handle those scalar iterations.  The possibilities are:
2078
2079    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2080        In this case:
2081
2082          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2083          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2084          LOOP_VINFO_PEELING_FOR_NITER == false
2085
2086    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2087        to handle the remaining scalar iterations.  In this case:
2088
2089          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2090          LOOP_VINFO_PEELING_FOR_NITER == true
2091
2092        There are two choices:
2093
2094        (2a) Consider vectorizing the epilogue loop at the same VF as the
2095             main loop, but using partial vectors instead of full vectors.
2096             In this case:
2097
2098               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2099
2100        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2101             In this case:
2102
2103               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2104
2105    When FOR_EPILOGUE_P is true, make this determination based on the
2106    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2107    based on the assumption that LOOP_VINFO is the main loop.  The caller
2108    has made sure that the number of iterations is set appropriately for
2109    this value of FOR_EPILOGUE_P.  */
2110
2111 opt_result
2112 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2113                                             bool for_epilogue_p)
2114 {
2115   /* Determine whether there would be any scalar iterations left over.  */
2116   bool need_peeling_or_partial_vectors_p
2117     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2118
2119   /* Decide whether to vectorize the loop with partial vectors.  */
2120   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2121   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2122   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2123       && need_peeling_or_partial_vectors_p)
2124     {
2125       /* For partial-vector-usage=1, try to push the handling of partial
2126          vectors to the epilogue, with the main loop continuing to operate
2127          on full vectors.
2128
2129          ??? We could then end up failing to use partial vectors if we
2130          decide to peel iterations into a prologue, and if the main loop
2131          then ends up processing fewer than VF iterations.  */
2132       if (param_vect_partial_vector_usage == 1
2133           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2134           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2135         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2136       else
2137         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2138     }
2139
2140   if (dump_enabled_p ())
2141     {
2142       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2143         dump_printf_loc (MSG_NOTE, vect_location,
2144                          "operating on partial vectors%s.\n",
2145                          for_epilogue_p ? " for epilogue loop" : "");
2146       else
2147         dump_printf_loc (MSG_NOTE, vect_location,
2148                          "operating only on full vectors%s.\n",
2149                          for_epilogue_p ? " for epilogue loop" : "");
2150     }
2151
2152   if (for_epilogue_p)
2153     {
2154       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2155       gcc_assert (orig_loop_vinfo);
2156       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2157         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2158                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2159     }
2160
2161   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2162       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2163     {
2164       /* Check that the loop processes at least one full vector.  */
2165       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2166       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2167       if (known_lt (wi::to_widest (scalar_niters), vf))
2168         return opt_result::failure_at (vect_location,
2169                                        "loop does not have enough iterations"
2170                                        " to support vectorization.\n");
2171
2172       /* If we need to peel an extra epilogue iteration to handle data
2173          accesses with gaps, check that there are enough scalar iterations
2174          available.
2175
2176          The check above is redundant with this one when peeling for gaps,
2177          but the distinction is useful for diagnostics.  */
2178       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2179       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2180           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2181         return opt_result::failure_at (vect_location,
2182                                        "loop does not have enough iterations"
2183                                        " to support peeling for gaps.\n");
2184     }
2185
2186   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2187     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2188        && need_peeling_or_partial_vectors_p);
2189
2190   return opt_result::success ();
2191 }
2192
2193 /* Function vect_analyze_loop_2.
2194
2195    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2196    for it.  The different analyses will record information in the
2197    loop_vec_info struct.  */
2198 static opt_result
2199 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2200 {
2201   opt_result ok = opt_result::success ();
2202   int res;
2203   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2204   poly_uint64 min_vf = 2;
2205   loop_vec_info orig_loop_vinfo = NULL;
2206
2207   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2208      loop_vec_info of the first vectorized loop.  */
2209   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2210     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2211   else
2212     orig_loop_vinfo = loop_vinfo;
2213   gcc_assert (orig_loop_vinfo);
2214
2215   /* The first group of checks is independent of the vector size.  */
2216   fatal = true;
2217
2218   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2219       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2220     return opt_result::failure_at (vect_location,
2221                                    "not vectorized: simd if(0)\n");
2222
2223   /* Find all data references in the loop (which correspond to vdefs/vuses)
2224      and analyze their evolution in the loop.  */
2225
2226   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2227
2228   /* Gather the data references and count stmts in the loop.  */
2229   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2230     {
2231       opt_result res
2232         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2233                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2234                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2235       if (!res)
2236         {
2237           if (dump_enabled_p ())
2238             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239                              "not vectorized: loop contains function "
2240                              "calls or data references that cannot "
2241                              "be analyzed\n");
2242           return res;
2243         }
2244       loop_vinfo->shared->save_datarefs ();
2245     }
2246   else
2247     loop_vinfo->shared->check_datarefs ();
2248
2249   /* Analyze the data references and also adjust the minimal
2250      vectorization factor according to the loads and stores.  */
2251
2252   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2253   if (!ok)
2254     {
2255       if (dump_enabled_p ())
2256         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257                          "bad data references.\n");
2258       return ok;
2259     }
2260
2261   /* Classify all cross-iteration scalar data-flow cycles.
2262      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2263   vect_analyze_scalar_cycles (loop_vinfo);
2264
2265   vect_pattern_recog (loop_vinfo);
2266
2267   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2268
2269   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2270      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2271
2272   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2273   if (!ok)
2274     {
2275       if (dump_enabled_p ())
2276         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277                          "bad data access.\n");
2278       return ok;
2279     }
2280
2281   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2282
2283   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2284   if (!ok)
2285     {
2286       if (dump_enabled_p ())
2287         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288                          "unexpected pattern.\n");
2289       return ok;
2290     }
2291
2292   /* While the rest of the analysis below depends on it in some way.  */
2293   fatal = false;
2294
2295   /* Analyze data dependences between the data-refs in the loop
2296      and adjust the maximum vectorization factor according to
2297      the dependences.
2298      FORNOW: fail at the first data dependence that we encounter.  */
2299
2300   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2301   if (!ok)
2302     {
2303       if (dump_enabled_p ())
2304         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2305                          "bad data dependence.\n");
2306       return ok;
2307     }
2308   if (max_vf != MAX_VECTORIZATION_FACTOR
2309       && maybe_lt (max_vf, min_vf))
2310     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2311   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2312
2313   ok = vect_determine_vectorization_factor (loop_vinfo);
2314   if (!ok)
2315     {
2316       if (dump_enabled_p ())
2317         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2318                          "can't determine vectorization factor.\n");
2319       return ok;
2320     }
2321   if (max_vf != MAX_VECTORIZATION_FACTOR
2322       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2323     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2324
2325   /* Compute the scalar iteration cost.  */
2326   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2327
2328   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2329
2330   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2331   ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2332   if (!ok)
2333     return ok;
2334
2335   /* If there are any SLP instances mark them as pure_slp.  */
2336   bool slp = vect_make_slp_decision (loop_vinfo);
2337   if (slp)
2338     {
2339       /* Find stmts that need to be both vectorized and SLPed.  */
2340       vect_detect_hybrid_slp (loop_vinfo);
2341
2342       /* Update the vectorization factor based on the SLP decision.  */
2343       vect_update_vf_for_slp (loop_vinfo);
2344
2345       /* Optimize the SLP graph with the vectorization factor fixed.  */
2346       vect_optimize_slp (loop_vinfo);
2347
2348       /* Gather the loads reachable from the SLP graph entries.  */
2349       vect_gather_slp_loads (loop_vinfo);
2350     }
2351
2352   bool saved_can_use_partial_vectors_p
2353     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2354
2355   /* We don't expect to have to roll back to anything other than an empty
2356      set of rgroups.  */
2357   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2358
2359   /* This is the point where we can re-start analysis with SLP forced off.  */
2360 start_over:
2361
2362   /* Now the vectorization factor is final.  */
2363   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2364   gcc_assert (known_ne (vectorization_factor, 0U));
2365
2366   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2367     {
2368       dump_printf_loc (MSG_NOTE, vect_location,
2369                        "vectorization_factor = ");
2370       dump_dec (MSG_NOTE, vectorization_factor);
2371       dump_printf (MSG_NOTE, ", niters = %wd\n",
2372                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2373     }
2374
2375   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2376
2377   /* Analyze the alignment of the data-refs in the loop.
2378      Fail if a data reference is found that cannot be vectorized.  */
2379
2380   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2381   if (!ok)
2382     {
2383       if (dump_enabled_p ())
2384         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2385                          "bad data alignment.\n");
2386       return ok;
2387     }
2388
2389   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2390      It is important to call pruning after vect_analyze_data_ref_accesses,
2391      since we use grouping information gathered by interleaving analysis.  */
2392   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2393   if (!ok)
2394     return ok;
2395
2396   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2397      vectorization, since we do not want to add extra peeling or
2398      add versioning for alignment.  */
2399   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2400     /* This pass will decide on using loop versioning and/or loop peeling in
2401        order to enhance the alignment of data references in the loop.  */
2402     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2403   if (!ok)
2404     return ok;
2405
2406   if (slp)
2407     {
2408       /* Analyze operations in the SLP instances.  Note this may
2409          remove unsupported SLP instances which makes the above
2410          SLP kind detection invalid.  */
2411       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2412       vect_slp_analyze_operations (loop_vinfo);
2413       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2414         {
2415           ok = opt_result::failure_at (vect_location,
2416                                        "unsupported SLP instances\n");
2417           goto again;
2418         }
2419
2420       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2421       slp_tree load_node, slp_root;
2422       unsigned i, x;
2423       slp_instance instance;
2424       bool can_use_lanes = true;
2425       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2426         {
2427           slp_root = SLP_INSTANCE_TREE (instance);
2428           int group_size = SLP_TREE_LANES (slp_root);
2429           tree vectype = SLP_TREE_VECTYPE (slp_root);
2430           bool loads_permuted = false;
2431           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2432             {
2433               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2434                 continue;
2435               unsigned j;
2436               stmt_vec_info load_info;
2437               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2438                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2439                   {
2440                     loads_permuted = true;
2441                     break;
2442                   }
2443             }
2444
2445           /* If the loads and stores can be handled with load/store-lane
2446              instructions record it and move on to the next instance.  */
2447           if (loads_permuted
2448               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2449               && vect_store_lanes_supported (vectype, group_size, false))
2450             {
2451               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2452                 {
2453                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2454                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2455                   /* Use SLP for strided accesses (or if we can't
2456                      load-lanes).  */
2457                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2458                       || ! vect_load_lanes_supported
2459                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2460                              DR_GROUP_SIZE (stmt_vinfo), false))
2461                     break;
2462                 }
2463
2464               can_use_lanes
2465                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2466
2467               if (can_use_lanes && dump_enabled_p ())
2468                 dump_printf_loc (MSG_NOTE, vect_location,
2469                                  "SLP instance %p can use load/store-lanes\n",
2470                                  instance);
2471             }
2472           else
2473             {
2474               can_use_lanes = false;
2475               break;
2476             }
2477         }
2478
2479       /* If all SLP instances can use load/store-lanes abort SLP and try again
2480          with SLP disabled.  */
2481       if (can_use_lanes)
2482         {
2483           ok = opt_result::failure_at (vect_location,
2484                                        "Built SLP cancelled: can use "
2485                                        "load/store-lanes\n");
2486           if (dump_enabled_p ())
2487             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2488                              "Built SLP cancelled: all SLP instances support "
2489                              "load/store-lanes\n");
2490           goto again;
2491         }
2492     }
2493
2494   /* Dissolve SLP-only groups.  */
2495   vect_dissolve_slp_only_groups (loop_vinfo);
2496
2497   /* Scan all the remaining operations in the loop that are not subject
2498      to SLP and make sure they are vectorizable.  */
2499   ok = vect_analyze_loop_operations (loop_vinfo);
2500   if (!ok)
2501     {
2502       if (dump_enabled_p ())
2503         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504                          "bad operation or unsupported loop bound.\n");
2505       return ok;
2506     }
2507
2508   /* For now, we don't expect to mix both masking and length approaches for one
2509      loop, disable it if both are recorded.  */
2510   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2511       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2512       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2513     {
2514       if (dump_enabled_p ())
2515         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2516                          "can't vectorize a loop with partial vectors"
2517                          " because we don't expect to mix different"
2518                          " approaches with partial vectors for the"
2519                          " same loop.\n");
2520       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2521     }
2522
2523   /* If we still have the option of using partial vectors,
2524      check whether we can generate the necessary loop controls.  */
2525   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2526       && !vect_verify_full_masking (loop_vinfo)
2527       && !vect_verify_loop_lens (loop_vinfo))
2528     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2529
2530   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2531      to be able to handle fewer than VF scalars, or needs to have a lower VF
2532      than the main loop.  */
2533   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2534       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2535       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2536                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2537     return opt_result::failure_at (vect_location,
2538                                    "Vectorization factor too high for"
2539                                    " epilogue loop.\n");
2540
2541   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2542      assuming that the loop will be used as a main loop.  We will redo
2543      this analysis later if we instead decide to use the loop as an
2544      epilogue loop.  */
2545   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2546   if (!ok)
2547     return ok;
2548
2549   /* Check the costings of the loop make vectorizing worthwhile.  */
2550   res = vect_analyze_loop_costing (loop_vinfo);
2551   if (res < 0)
2552     {
2553       ok = opt_result::failure_at (vect_location,
2554                                    "Loop costings may not be worthwhile.\n");
2555       goto again;
2556     }
2557   if (!res)
2558     return opt_result::failure_at (vect_location,
2559                                    "Loop costings not worthwhile.\n");
2560
2561   /* If an epilogue loop is required make sure we can create one.  */
2562   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2563       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2564     {
2565       if (dump_enabled_p ())
2566         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2567       if (!vect_can_advance_ivs_p (loop_vinfo)
2568           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2569                                            single_exit (LOOP_VINFO_LOOP
2570                                                          (loop_vinfo))))
2571         {
2572           ok = opt_result::failure_at (vect_location,
2573                                        "not vectorized: can't create required "
2574                                        "epilog loop\n");
2575           goto again;
2576         }
2577     }
2578
2579   /* During peeling, we need to check if number of loop iterations is
2580      enough for both peeled prolog loop and vector loop.  This check
2581      can be merged along with threshold check of loop versioning, so
2582      increase threshold for this case if necessary.
2583
2584      If we are analyzing an epilogue we still want to check what its
2585      versioning threshold would be.  If we decide to vectorize the epilogues we
2586      will want to use the lowest versioning threshold of all epilogues and main
2587      loop.  This will enable us to enter a vectorized epilogue even when
2588      versioning the loop.  We can't simply check whether the epilogue requires
2589      versioning though since we may have skipped some versioning checks when
2590      analyzing the epilogue.  For instance, checks for alias versioning will be
2591      skipped when dealing with epilogues as we assume we already checked them
2592      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2593   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2594     {
2595       poly_uint64 niters_th = 0;
2596       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2597
2598       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2599         {
2600           /* Niters for peeled prolog loop.  */
2601           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2602             {
2603               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2604               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2605               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2606             }
2607           else
2608             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2609         }
2610
2611       /* Niters for at least one iteration of vectorized loop.  */
2612       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2613         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2614       /* One additional iteration because of peeling for gap.  */
2615       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2616         niters_th += 1;
2617
2618       /*  Use the same condition as vect_transform_loop to decide when to use
2619           the cost to determine a versioning threshold.  */
2620       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2621           && ordered_p (th, niters_th))
2622         niters_th = ordered_max (poly_uint64 (th), niters_th);
2623
2624       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2625     }
2626
2627   gcc_assert (known_eq (vectorization_factor,
2628                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2629
2630   /* Ok to vectorize!  */
2631   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2632   return opt_result::success ();
2633
2634 again:
2635   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2636   gcc_assert (!ok);
2637
2638   /* Try again with SLP forced off but if we didn't do any SLP there is
2639      no point in re-trying.  */
2640   if (!slp)
2641     return ok;
2642
2643   /* If there are reduction chains re-trying will fail anyway.  */
2644   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2645     return ok;
2646
2647   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2648      via interleaving or lane instructions.  */
2649   slp_instance instance;
2650   slp_tree node;
2651   unsigned i, j;
2652   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2653     {
2654       stmt_vec_info vinfo;
2655       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2656       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2657         continue;
2658       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2659       unsigned int size = DR_GROUP_SIZE (vinfo);
2660       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2661       if (! vect_store_lanes_supported (vectype, size, false)
2662          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2663          && ! vect_grouped_store_supported (vectype, size))
2664         return opt_result::failure_at (vinfo->stmt,
2665                                        "unsupported grouped store\n");
2666       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2667         {
2668           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2669           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2670           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2671           size = DR_GROUP_SIZE (vinfo);
2672           vectype = STMT_VINFO_VECTYPE (vinfo);
2673           if (! vect_load_lanes_supported (vectype, size, false)
2674               && ! vect_grouped_load_supported (vectype, single_element_p,
2675                                                 size))
2676             return opt_result::failure_at (vinfo->stmt,
2677                                            "unsupported grouped load\n");
2678         }
2679     }
2680
2681   if (dump_enabled_p ())
2682     dump_printf_loc (MSG_NOTE, vect_location,
2683                      "re-trying with SLP disabled\n");
2684
2685   /* Roll back state appropriately.  No SLP this time.  */
2686   slp = false;
2687   /* Restore vectorization factor as it were without SLP.  */
2688   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2689   /* Free the SLP instances.  */
2690   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2691     vect_free_slp_instance (instance);
2692   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2693   /* Reset SLP type to loop_vect on all stmts.  */
2694   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2695     {
2696       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2697       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2698            !gsi_end_p (si); gsi_next (&si))
2699         {
2700           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2701           STMT_SLP_TYPE (stmt_info) = loop_vect;
2702           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2703               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2704             {
2705               /* vectorizable_reduction adjusts reduction stmt def-types,
2706                  restore them to that of the PHI.  */
2707               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2708                 = STMT_VINFO_DEF_TYPE (stmt_info);
2709               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2710                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2711                 = STMT_VINFO_DEF_TYPE (stmt_info);
2712             }
2713         }
2714       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2715            !gsi_end_p (si); gsi_next (&si))
2716         {
2717           if (is_gimple_debug (gsi_stmt (si)))
2718             continue;
2719           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2720           STMT_SLP_TYPE (stmt_info) = loop_vect;
2721           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2722             {
2723               stmt_vec_info pattern_stmt_info
2724                 = STMT_VINFO_RELATED_STMT (stmt_info);
2725               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2726                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2727
2728               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2729               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2730               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2731                    !gsi_end_p (pi); gsi_next (&pi))
2732                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2733                   = loop_vect;
2734             }
2735         }
2736     }
2737   /* Free optimized alias test DDRS.  */
2738   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2739   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2740   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2741   /* Reset target cost data.  */
2742   delete loop_vinfo->vector_costs;
2743   loop_vinfo->vector_costs = nullptr;
2744   /* Reset accumulated rgroup information.  */
2745   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2746   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2747   /* Reset assorted flags.  */
2748   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2749   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2750   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2751   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2752   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2753     = saved_can_use_partial_vectors_p;
2754
2755   goto start_over;
2756 }
2757
2758 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2759    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2760    OLD_LOOP_VINFO is better unless something specifically indicates
2761    otherwise.
2762
2763    Note that this deliberately isn't a partial order.  */
2764
2765 static bool
2766 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2767                           loop_vec_info old_loop_vinfo)
2768 {
2769   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2770   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2771
2772   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2773   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2774
2775   /* Always prefer a VF of loop->simdlen over any other VF.  */
2776   if (loop->simdlen)
2777     {
2778       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2779       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2780       if (new_simdlen_p != old_simdlen_p)
2781         return new_simdlen_p;
2782     }
2783
2784   const auto *old_costs = old_loop_vinfo->vector_costs;
2785   const auto *new_costs = new_loop_vinfo->vector_costs;
2786   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2787     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2788
2789   return new_costs->better_main_loop_than_p (old_costs);
2790 }
2791
2792 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2793    true if we should.  */
2794
2795 static bool
2796 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2797                         loop_vec_info old_loop_vinfo)
2798 {
2799   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2800     return false;
2801
2802   if (dump_enabled_p ())
2803     dump_printf_loc (MSG_NOTE, vect_location,
2804                      "***** Preferring vector mode %s to vector mode %s\n",
2805                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
2806                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
2807   return true;
2808 }
2809
2810 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2811    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2812    MODE_I to the next mode useful to analyze.
2813    Return the loop_vinfo on success and wrapped null on failure.  */
2814
2815 static opt_loop_vec_info
2816 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2817                      const vect_loop_form_info *loop_form_info,
2818                      loop_vec_info main_loop_vinfo,
2819                      const vector_modes &vector_modes, unsigned &mode_i,
2820                      machine_mode &autodetected_vector_mode,
2821                      bool &fatal)
2822 {
2823   loop_vec_info loop_vinfo
2824     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2825
2826   machine_mode vector_mode = vector_modes[mode_i];
2827   loop_vinfo->vector_mode = vector_mode;
2828
2829   /* Run the main analysis.  */
2830   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2831   if (dump_enabled_p ())
2832     dump_printf_loc (MSG_NOTE, vect_location,
2833                      "***** Analysis %s with vector mode %s\n",
2834                      res ? "succeeded" : " failed",
2835                      GET_MODE_NAME (loop_vinfo->vector_mode));
2836
2837   /* Remember the autodetected vector mode.  */
2838   if (vector_mode == VOIDmode)
2839     autodetected_vector_mode = loop_vinfo->vector_mode;
2840
2841   /* Advance mode_i, first skipping modes that would result in the
2842      same analysis result.  */
2843   while (mode_i + 1 < vector_modes.length ()
2844          && vect_chooses_same_modes_p (loop_vinfo,
2845                                        vector_modes[mode_i + 1]))
2846     {
2847       if (dump_enabled_p ())
2848         dump_printf_loc (MSG_NOTE, vect_location,
2849                          "***** The result for vector mode %s would"
2850                          " be the same\n",
2851                          GET_MODE_NAME (vector_modes[mode_i + 1]));
2852       mode_i += 1;
2853     }
2854   if (mode_i + 1 < vector_modes.length ()
2855       && VECTOR_MODE_P (autodetected_vector_mode)
2856       && (related_vector_mode (vector_modes[mode_i + 1],
2857                                GET_MODE_INNER (autodetected_vector_mode))
2858           == autodetected_vector_mode)
2859       && (related_vector_mode (autodetected_vector_mode,
2860                                GET_MODE_INNER (vector_modes[mode_i + 1]))
2861           == vector_modes[mode_i + 1]))
2862     {
2863       if (dump_enabled_p ())
2864         dump_printf_loc (MSG_NOTE, vect_location,
2865                          "***** Skipping vector mode %s, which would"
2866                          " repeat the analysis for %s\n",
2867                          GET_MODE_NAME (vector_modes[mode_i + 1]),
2868                          GET_MODE_NAME (autodetected_vector_mode));
2869       mode_i += 1;
2870     }
2871   mode_i++;
2872
2873   if (!res)
2874     {
2875       delete loop_vinfo;
2876       if (fatal)
2877         gcc_checking_assert (main_loop_vinfo == NULL);
2878       return opt_loop_vec_info::propagate_failure (res);
2879     }
2880
2881   return opt_loop_vec_info::success (loop_vinfo);
2882 }
2883
2884 /* Function vect_analyze_loop.
2885
2886    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2887    for it.  The different analyses will record information in the
2888    loop_vec_info struct.  */
2889 opt_loop_vec_info
2890 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2891 {
2892   DUMP_VECT_SCOPE ("analyze_loop_nest");
2893
2894   if (loop_outer (loop)
2895       && loop_vec_info_for_loop (loop_outer (loop))
2896       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2897     return opt_loop_vec_info::failure_at (vect_location,
2898                                           "outer-loop already vectorized.\n");
2899
2900   if (!find_loop_nest (loop, &shared->loop_nest))
2901     return opt_loop_vec_info::failure_at
2902       (vect_location,
2903        "not vectorized: loop nest containing two or more consecutive inner"
2904        " loops cannot be vectorized\n");
2905
2906   /* Analyze the loop form.  */
2907   vect_loop_form_info loop_form_info;
2908   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2909   if (!res)
2910     {
2911       if (dump_enabled_p ())
2912         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2913                          "bad loop form.\n");
2914       return opt_loop_vec_info::propagate_failure (res);
2915     }
2916   if (!integer_onep (loop_form_info.assumptions))
2917     {
2918       /* We consider to vectorize this loop by versioning it under
2919          some assumptions.  In order to do this, we need to clear
2920          existing information computed by scev and niter analyzer.  */
2921       scev_reset_htab ();
2922       free_numbers_of_iterations_estimates (loop);
2923       /* Also set flag for this loop so that following scev and niter
2924          analysis are done under the assumptions.  */
2925       loop_constraint_set (loop, LOOP_C_FINITE);
2926     }
2927
2928   auto_vector_modes vector_modes;
2929   /* Autodetect first vector size we try.  */
2930   vector_modes.safe_push (VOIDmode);
2931   unsigned int autovec_flags
2932     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2933                                                     loop->simdlen != 0);
2934   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2935                              && !unlimited_cost_model (loop));
2936   machine_mode autodetected_vector_mode = VOIDmode;
2937   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2938   unsigned int mode_i = 0;
2939   unsigned int first_loop_i = 0;
2940   unsigned int first_loop_next_i = 0;
2941   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2942
2943   /* First determine the main loop vectorization mode, either the first
2944      one that works, starting with auto-detecting the vector mode and then
2945      following the targets order of preference, or the one with the
2946      lowest cost if pick_lowest_cost_p.  */
2947   while (1)
2948     {
2949       unsigned int loop_vinfo_i = mode_i;
2950       bool fatal;
2951       opt_loop_vec_info loop_vinfo
2952         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2953                                NULL, vector_modes, mode_i,
2954                                autodetected_vector_mode, fatal);
2955       if (fatal)
2956         break;
2957
2958       if (loop_vinfo)
2959         {
2960           /* Once we hit the desired simdlen for the first time,
2961              discard any previous attempts.  */
2962           if (simdlen
2963               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2964             {
2965               delete first_loop_vinfo;
2966               first_loop_vinfo = opt_loop_vec_info::success (NULL);
2967               simdlen = 0;
2968             }
2969           else if (pick_lowest_cost_p
2970                    && first_loop_vinfo
2971                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2972             {
2973               /* Pick loop_vinfo over first_loop_vinfo.  */
2974               delete first_loop_vinfo;
2975               first_loop_vinfo = opt_loop_vec_info::success (NULL);
2976             }
2977           if (first_loop_vinfo == NULL)
2978             {
2979               first_loop_vinfo = loop_vinfo;
2980               first_loop_i = loop_vinfo_i;
2981               first_loop_next_i = mode_i;
2982             }
2983           else
2984             {
2985               delete loop_vinfo;
2986               loop_vinfo = opt_loop_vec_info::success (NULL);
2987             }
2988
2989           /* Commit to first_loop_vinfo if we have no reason to try
2990              alternatives.  */
2991           if (!simdlen && !pick_lowest_cost_p)
2992             break;
2993         }
2994       if (mode_i == vector_modes.length ()
2995           || autodetected_vector_mode == VOIDmode)
2996         break;
2997
2998       /* Try the next biggest vector size.  */
2999       if (dump_enabled_p ())
3000         dump_printf_loc (MSG_NOTE, vect_location,
3001                          "***** Re-trying analysis with vector mode %s\n",
3002                          GET_MODE_NAME (vector_modes[mode_i]));
3003     }
3004   if (!first_loop_vinfo)
3005     return opt_loop_vec_info::propagate_failure (res);
3006
3007   if (dump_enabled_p ())
3008     dump_printf_loc (MSG_NOTE, vect_location,
3009                      "***** Choosing vector mode %s\n",
3010                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3011
3012   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3013      enabled, SIMDUID is not set, it is the innermost loop and we have
3014      either already found the loop's SIMDLEN or there was no SIMDLEN to
3015      begin with.
3016      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3017   bool vect_epilogues = (!simdlen
3018                          && loop->inner == NULL
3019                          && param_vect_epilogues_nomask
3020                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3021                          && !loop->simduid);
3022   if (!vect_epilogues)
3023     return first_loop_vinfo;
3024
3025   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3026   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3027
3028   /* Handle the case that the original loop can use partial
3029      vectorization, but want to only adopt it for the epilogue.
3030      The retry should be in the same mode as original.  */
3031   if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3032     {
3033       gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3034                   && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3035       if (dump_enabled_p ())
3036         dump_printf_loc (MSG_NOTE, vect_location,
3037                          "***** Re-trying analysis with same vector mode"
3038                          " %s for epilogue with partial vectors.\n",
3039                          GET_MODE_NAME (first_loop_vinfo->vector_mode));
3040       mode_i = first_loop_i;
3041     }
3042   else
3043     {
3044       mode_i = first_loop_next_i;
3045       if (mode_i == vector_modes.length ())
3046         return first_loop_vinfo;
3047     }
3048
3049   /* ???  If first_loop_vinfo was using VOIDmode then we probably
3050      want to instead search for the corresponding mode in vector_modes[].  */
3051
3052   while (1)
3053     {
3054       bool fatal;
3055       opt_loop_vec_info loop_vinfo
3056         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3057                                first_loop_vinfo,
3058                                vector_modes, mode_i,
3059                                autodetected_vector_mode, fatal);
3060       if (fatal)
3061         break;
3062
3063       if (loop_vinfo)
3064         {
3065           if (pick_lowest_cost_p)
3066             {
3067               /* Keep trying to roll back vectorization attempts while the
3068                  loop_vec_infos they produced were worse than this one.  */
3069               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3070               while (!vinfos.is_empty ()
3071                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3072                 {
3073                   gcc_assert (vect_epilogues);
3074                   delete vinfos.pop ();
3075                 }
3076             }
3077           /* For now only allow one epilogue loop.  */
3078           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3079             {
3080               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3081               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3082               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3083                           || maybe_ne (lowest_th, 0U));
3084               /* Keep track of the known smallest versioning
3085                  threshold.  */
3086               if (ordered_p (lowest_th, th))
3087                 lowest_th = ordered_min (lowest_th, th);
3088             }
3089           else
3090             {
3091               delete loop_vinfo;
3092               loop_vinfo = opt_loop_vec_info::success (NULL);
3093             }
3094
3095           /* For now only allow one epilogue loop, but allow
3096              pick_lowest_cost_p to replace it, so commit to the
3097              first epilogue if we have no reason to try alternatives.  */
3098           if (!pick_lowest_cost_p)
3099             break;
3100         }
3101
3102       if (mode_i == vector_modes.length ())
3103         break;
3104
3105       /* Try the next biggest vector size.  */
3106       if (dump_enabled_p ())
3107         dump_printf_loc (MSG_NOTE, vect_location,
3108                          "***** Re-trying epilogue analysis with vector "
3109                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3110     }
3111
3112   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3113     {
3114       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3115       if (dump_enabled_p ())
3116         dump_printf_loc (MSG_NOTE, vect_location,
3117                          "***** Choosing epilogue vector mode %s\n",
3118                          GET_MODE_NAME
3119                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3120     }
3121
3122   return first_loop_vinfo;
3123 }
3124
3125 /* Return true if there is an in-order reduction function for CODE, storing
3126    it in *REDUC_FN if so.  */
3127
3128 static bool
3129 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3130 {
3131   if (code == PLUS_EXPR)
3132     {
3133       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3134       return true;
3135     }
3136   return false;
3137 }
3138
3139 /* Function reduction_fn_for_scalar_code
3140
3141    Input:
3142    CODE - tree_code of a reduction operations.
3143
3144    Output:
3145    REDUC_FN - the corresponding internal function to be used to reduce the
3146       vector of partial results into a single scalar result, or IFN_LAST
3147       if the operation is a supported reduction operation, but does not have
3148       such an internal function.
3149
3150    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3151
3152 bool
3153 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3154 {
3155   if (code.is_tree_code ())
3156     switch (tree_code (code))
3157       {
3158       case MAX_EXPR:
3159         *reduc_fn = IFN_REDUC_MAX;
3160         return true;
3161
3162       case MIN_EXPR:
3163         *reduc_fn = IFN_REDUC_MIN;
3164         return true;
3165
3166       case PLUS_EXPR:
3167         *reduc_fn = IFN_REDUC_PLUS;
3168         return true;
3169
3170       case BIT_AND_EXPR:
3171         *reduc_fn = IFN_REDUC_AND;
3172         return true;
3173
3174       case BIT_IOR_EXPR:
3175         *reduc_fn = IFN_REDUC_IOR;
3176         return true;
3177
3178       case BIT_XOR_EXPR:
3179         *reduc_fn = IFN_REDUC_XOR;
3180         return true;
3181
3182       case MULT_EXPR:
3183       case MINUS_EXPR:
3184         *reduc_fn = IFN_LAST;
3185         return true;
3186
3187       default:
3188         return false;
3189       }
3190   else
3191     switch (combined_fn (code))
3192       {
3193       CASE_CFN_FMAX:
3194         *reduc_fn = IFN_REDUC_FMAX;
3195         return true;
3196
3197       CASE_CFN_FMIN:
3198         *reduc_fn = IFN_REDUC_FMIN;
3199         return true;
3200
3201       default:
3202         return false;
3203       }
3204 }
3205
3206 /* If there is a neutral value X such that a reduction would not be affected
3207    by the introduction of additional X elements, return that X, otherwise
3208    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3209    of the scalar elements.  If the reduction has just a single initial value
3210    then INITIAL_VALUE is that value, otherwise it is null.  */
3211
3212 tree
3213 neutral_op_for_reduction (tree scalar_type, code_helper code,
3214                           tree initial_value)
3215 {
3216   if (code.is_tree_code ())
3217     switch (tree_code (code))
3218       {
3219       case WIDEN_SUM_EXPR:
3220       case DOT_PROD_EXPR:
3221       case SAD_EXPR:
3222       case PLUS_EXPR:
3223       case MINUS_EXPR:
3224       case BIT_IOR_EXPR:
3225       case BIT_XOR_EXPR:
3226         return build_zero_cst (scalar_type);
3227
3228       case MULT_EXPR:
3229         return build_one_cst (scalar_type);
3230
3231       case BIT_AND_EXPR:
3232         return build_all_ones_cst (scalar_type);
3233
3234       case MAX_EXPR:
3235       case MIN_EXPR:
3236         return initial_value;
3237
3238       default:
3239         return NULL_TREE;
3240       }
3241   else
3242     switch (combined_fn (code))
3243       {
3244       CASE_CFN_FMIN:
3245       CASE_CFN_FMAX:
3246         return initial_value;
3247
3248       default:
3249         return NULL_TREE;
3250       }
3251 }
3252
3253 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3254    STMT is printed with a message MSG. */
3255
3256 static void
3257 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3258 {
3259   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3260 }
3261
3262 /* Return true if we need an in-order reduction for operation CODE
3263    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3264    overflow must wrap.  */
3265
3266 bool
3267 needs_fold_left_reduction_p (tree type, code_helper code)
3268 {
3269   /* CHECKME: check for !flag_finite_math_only too?  */
3270   if (SCALAR_FLOAT_TYPE_P (type))
3271     {
3272       if (code.is_tree_code ())
3273         switch (tree_code (code))
3274           {
3275           case MIN_EXPR:
3276           case MAX_EXPR:
3277             return false;
3278
3279           default:
3280             return !flag_associative_math;
3281           }
3282       else
3283         switch (combined_fn (code))
3284           {
3285           CASE_CFN_FMIN:
3286           CASE_CFN_FMAX:
3287             return false;
3288
3289           default:
3290             return !flag_associative_math;
3291           }
3292     }
3293
3294   if (INTEGRAL_TYPE_P (type))
3295     return (!code.is_tree_code ()
3296             || !operation_no_trapping_overflow (type, tree_code (code)));
3297
3298   if (SAT_FIXED_POINT_TYPE_P (type))
3299     return true;
3300
3301   return false;
3302 }
3303
3304 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3305    has a handled computation expression.  Store the main reduction
3306    operation in *CODE.  */
3307
3308 static bool
3309 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3310                       tree loop_arg, code_helper *code,
3311                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3312 {
3313   auto_bitmap visited;
3314   tree lookfor = PHI_RESULT (phi);
3315   ssa_op_iter curri;
3316   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3317   while (USE_FROM_PTR (curr) != loop_arg)
3318     curr = op_iter_next_use (&curri);
3319   curri.i = curri.numops;
3320   do
3321     {
3322       path.safe_push (std::make_pair (curri, curr));
3323       tree use = USE_FROM_PTR (curr);
3324       if (use == lookfor)
3325         break;
3326       gimple *def = SSA_NAME_DEF_STMT (use);
3327       if (gimple_nop_p (def)
3328           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3329         {
3330 pop:
3331           do
3332             {
3333               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3334               curri = x.first;
3335               curr = x.second;
3336               do
3337                 curr = op_iter_next_use (&curri);
3338               /* Skip already visited or non-SSA operands (from iterating
3339                  over PHI args).  */
3340               while (curr != NULL_USE_OPERAND_P
3341                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3342                          || ! bitmap_set_bit (visited,
3343                                               SSA_NAME_VERSION
3344                                                 (USE_FROM_PTR (curr)))));
3345             }
3346           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3347           if (curr == NULL_USE_OPERAND_P)
3348             break;
3349         }
3350       else
3351         {
3352           if (gimple_code (def) == GIMPLE_PHI)
3353             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3354           else
3355             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3356           while (curr != NULL_USE_OPERAND_P
3357                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3358                      || ! bitmap_set_bit (visited,
3359                                           SSA_NAME_VERSION
3360                                             (USE_FROM_PTR (curr)))))
3361             curr = op_iter_next_use (&curri);
3362           if (curr == NULL_USE_OPERAND_P)
3363             goto pop;
3364         }
3365     }
3366   while (1);
3367   if (dump_file && (dump_flags & TDF_DETAILS))
3368     {
3369       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3370       unsigned i;
3371       std::pair<ssa_op_iter, use_operand_p> *x;
3372       FOR_EACH_VEC_ELT (path, i, x)
3373         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3374       dump_printf (MSG_NOTE, "\n");
3375     }
3376
3377   /* Check whether the reduction path detected is valid.  */
3378   bool fail = path.length () == 0;
3379   bool neg = false;
3380   int sign = -1;
3381   *code = ERROR_MARK;
3382   for (unsigned i = 1; i < path.length (); ++i)
3383     {
3384       gimple *use_stmt = USE_STMT (path[i].second);
3385       gimple_match_op op;
3386       if (!gimple_extract_op (use_stmt, &op))
3387         {
3388           fail = true;
3389           break;
3390         }
3391       unsigned int opi = op.num_ops;
3392       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3393         {
3394           /* The following make sure we can compute the operand index
3395              easily plus it mostly disallows chaining via COND_EXPR condition
3396              operands.  */
3397           for (opi = 0; opi < op.num_ops; ++opi)
3398             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3399               break;
3400         }
3401       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3402         {
3403           for (opi = 0; opi < op.num_ops; ++opi)
3404             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3405               break;
3406         }
3407       if (opi == op.num_ops)
3408         {
3409           fail = true;
3410           break;
3411         }
3412       op.code = canonicalize_code (op.code, op.type);
3413       if (op.code == MINUS_EXPR)
3414         {
3415           op.code = PLUS_EXPR;
3416           /* Track whether we negate the reduction value each iteration.  */
3417           if (op.ops[1] == op.ops[opi])
3418             neg = ! neg;
3419         }
3420       if (CONVERT_EXPR_CODE_P (op.code)
3421           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3422         ;
3423       else if (*code == ERROR_MARK)
3424         {
3425           *code = op.code;
3426           sign = TYPE_SIGN (op.type);
3427         }
3428       else if (op.code != *code)
3429         {
3430           fail = true;
3431           break;
3432         }
3433       else if ((op.code == MIN_EXPR
3434                 || op.code == MAX_EXPR)
3435                && sign != TYPE_SIGN (op.type))
3436         {
3437           fail = true;
3438           break;
3439         }
3440       /* Check there's only a single stmt the op is used on.  For the
3441          not value-changing tail and the last stmt allow out-of-loop uses.
3442          ???  We could relax this and handle arbitrary live stmts by
3443          forcing a scalar epilogue for example.  */
3444       imm_use_iterator imm_iter;
3445       gimple *op_use_stmt;
3446       unsigned cnt = 0;
3447       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3448         if (!is_gimple_debug (op_use_stmt)
3449             && (*code != ERROR_MARK
3450                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3451           {
3452             /* We want to allow x + x but not x < 1 ? x : 2.  */
3453             if (is_gimple_assign (op_use_stmt)
3454                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3455               {
3456                 use_operand_p use_p;
3457                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3458                   cnt++;
3459               }
3460             else
3461               cnt++;
3462           }
3463       if (cnt != 1)
3464         {
3465           fail = true;
3466           break;
3467         }
3468     }
3469   return ! fail && ! neg && *code != ERROR_MARK;
3470 }
3471
3472 bool
3473 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3474                       tree loop_arg, enum tree_code code)
3475 {
3476   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3477   code_helper code_;
3478   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3479           && code_ == code);
3480 }
3481
3482
3483
3484 /* Function vect_is_simple_reduction
3485
3486    (1) Detect a cross-iteration def-use cycle that represents a simple
3487    reduction computation.  We look for the following pattern:
3488
3489    loop_header:
3490      a1 = phi < a0, a2 >
3491      a3 = ...
3492      a2 = operation (a3, a1)
3493
3494    or
3495
3496    a3 = ...
3497    loop_header:
3498      a1 = phi < a0, a2 >
3499      a2 = operation (a3, a1)
3500
3501    such that:
3502    1. operation is commutative and associative and it is safe to
3503       change the order of the computation
3504    2. no uses for a2 in the loop (a2 is used out of the loop)
3505    3. no uses of a1 in the loop besides the reduction operation
3506    4. no uses of a1 outside the loop.
3507
3508    Conditions 1,4 are tested here.
3509    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3510
3511    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3512    nested cycles.
3513
3514    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3515    reductions:
3516
3517      a1 = phi < a0, a2 >
3518      inner loop (def of a3)
3519      a2 = phi < a3 >
3520
3521    (4) Detect condition expressions, ie:
3522      for (int i = 0; i < N; i++)
3523        if (a[i] < val)
3524         ret_val = a[i];
3525
3526 */
3527
3528 static stmt_vec_info
3529 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3530                           bool *double_reduc, bool *reduc_chain_p)
3531 {
3532   gphi *phi = as_a <gphi *> (phi_info->stmt);
3533   gimple *phi_use_stmt = NULL;
3534   imm_use_iterator imm_iter;
3535   use_operand_p use_p;
3536
3537   *double_reduc = false;
3538   *reduc_chain_p = false;
3539   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3540
3541   tree phi_name = PHI_RESULT (phi);
3542   /* ???  If there are no uses of the PHI result the inner loop reduction
3543      won't be detected as possibly double-reduction by vectorizable_reduction
3544      because that tries to walk the PHI arg from the preheader edge which
3545      can be constant.  See PR60382.  */
3546   if (has_zero_uses (phi_name))
3547     return NULL;
3548   class loop *loop = (gimple_bb (phi))->loop_father;
3549   unsigned nphi_def_loop_uses = 0;
3550   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3551     {
3552       gimple *use_stmt = USE_STMT (use_p);
3553       if (is_gimple_debug (use_stmt))
3554         continue;
3555
3556       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3557         {
3558           if (dump_enabled_p ())
3559             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3560                              "intermediate value used outside loop.\n");
3561
3562           return NULL;
3563         }
3564
3565       nphi_def_loop_uses++;
3566       phi_use_stmt = use_stmt;
3567     }
3568
3569   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3570   if (TREE_CODE (latch_def) != SSA_NAME)
3571     {
3572       if (dump_enabled_p ())
3573         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3574                          "reduction: not ssa_name: %T\n", latch_def);
3575       return NULL;
3576     }
3577
3578   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3579   if (!def_stmt_info
3580       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3581     return NULL;
3582
3583   bool nested_in_vect_loop
3584     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3585   unsigned nlatch_def_loop_uses = 0;
3586   auto_vec<gphi *, 3> lcphis;
3587   bool inner_loop_of_double_reduc = false;
3588   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3589     {
3590       gimple *use_stmt = USE_STMT (use_p);
3591       if (is_gimple_debug (use_stmt))
3592         continue;
3593       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3594         nlatch_def_loop_uses++;
3595       else
3596         {
3597           /* We can have more than one loop-closed PHI.  */
3598           lcphis.safe_push (as_a <gphi *> (use_stmt));
3599           if (nested_in_vect_loop
3600               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3601                   == vect_double_reduction_def))
3602             inner_loop_of_double_reduc = true;
3603         }
3604     }
3605
3606   /* If we are vectorizing an inner reduction we are executing that
3607      in the original order only in case we are not dealing with a
3608      double reduction.  */
3609   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3610     {
3611       if (dump_enabled_p ())
3612         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3613                         "detected nested cycle: ");
3614       return def_stmt_info;
3615     }
3616
3617   /* When the inner loop of a double reduction ends up with more than
3618      one loop-closed PHI we have failed to classify alternate such
3619      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3620   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3621     {
3622       if (dump_enabled_p ())
3623         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3624                          "unhandle double reduction\n");
3625       return NULL;
3626     }
3627
3628   /* If this isn't a nested cycle or if the nested cycle reduction value
3629      is used ouside of the inner loop we cannot handle uses of the reduction
3630      value.  */
3631   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3632     {
3633       if (dump_enabled_p ())
3634         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3635                          "reduction used in loop.\n");
3636       return NULL;
3637     }
3638
3639   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3640      defined in the inner loop.  */
3641   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3642     {
3643       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3644       if (gimple_phi_num_args (def_stmt) != 1
3645           || TREE_CODE (op1) != SSA_NAME)
3646         {
3647           if (dump_enabled_p ())
3648             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649                              "unsupported phi node definition.\n");
3650
3651           return NULL;
3652         }
3653
3654       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3655       if (gimple_bb (def1)
3656           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3657           && loop->inner
3658           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3659           && (is_gimple_assign (def1) || is_gimple_call (def1))
3660           && is_a <gphi *> (phi_use_stmt)
3661           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3662         {
3663           if (dump_enabled_p ())
3664             report_vect_op (MSG_NOTE, def_stmt,
3665                             "detected double reduction: ");
3666
3667           *double_reduc = true;
3668           return def_stmt_info;
3669         }
3670
3671       return NULL;
3672     }
3673
3674   /* Look for the expression computing latch_def from then loop PHI result.  */
3675   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3676   code_helper code;
3677   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3678                             path))
3679     {
3680       STMT_VINFO_REDUC_CODE (phi_info) = code;
3681       if (code == COND_EXPR && !nested_in_vect_loop)
3682         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3683
3684       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3685          reduction chain for which the additional restriction is that
3686          all operations in the chain are the same.  */
3687       auto_vec<stmt_vec_info, 8> reduc_chain;
3688       unsigned i;
3689       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3690       for (i = path.length () - 1; i >= 1; --i)
3691         {
3692           gimple *stmt = USE_STMT (path[i].second);
3693           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3694           gimple_match_op op;
3695           if (!gimple_extract_op (stmt, &op))
3696             gcc_unreachable ();
3697           if (gassign *assign = dyn_cast<gassign *> (stmt))
3698             STMT_VINFO_REDUC_IDX (stmt_info)
3699               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3700           else
3701             {
3702               gcall *call = as_a<gcall *> (stmt);
3703               STMT_VINFO_REDUC_IDX (stmt_info)
3704                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3705             }
3706           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3707                                      && (i == 1 || i == path.length () - 1));
3708           if ((op.code != code && !leading_conversion)
3709               /* We can only handle the final value in epilogue
3710                  generation for reduction chains.  */
3711               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3712             is_slp_reduc = false;
3713           /* For reduction chains we support a trailing/leading
3714              conversions.  We do not store those in the actual chain.  */
3715           if (leading_conversion)
3716             continue;
3717           reduc_chain.safe_push (stmt_info);
3718         }
3719       if (is_slp_reduc && reduc_chain.length () > 1)
3720         {
3721           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3722             {
3723               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3724               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3725             }
3726           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3727           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3728
3729           /* Save the chain for further analysis in SLP detection.  */
3730           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3731           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3732
3733           *reduc_chain_p = true;
3734           if (dump_enabled_p ())
3735             dump_printf_loc (MSG_NOTE, vect_location,
3736                             "reduction: detected reduction chain\n");
3737         }
3738       else if (dump_enabled_p ())
3739         dump_printf_loc (MSG_NOTE, vect_location,
3740                          "reduction: detected reduction\n");
3741
3742       return def_stmt_info;
3743     }
3744
3745   if (dump_enabled_p ())
3746     dump_printf_loc (MSG_NOTE, vect_location,
3747                      "reduction: unknown pattern\n");
3748
3749   return NULL;
3750 }
3751
3752 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3753    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3754    or -1 if not known.  */
3755
3756 static int
3757 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3758 {
3759   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3760   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3761     {
3762       if (dump_enabled_p ())
3763         dump_printf_loc (MSG_NOTE, vect_location,
3764                          "cost model: epilogue peel iters set to vf/2 "
3765                          "because loop iterations are unknown .\n");
3766       return assumed_vf / 2;
3767     }
3768   else
3769     {
3770       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3771       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3772       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3773       /* If we need to peel for gaps, but no peeling is required, we have to
3774          peel VF iterations.  */
3775       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3776         peel_iters_epilogue = assumed_vf;
3777       return peel_iters_epilogue;
3778     }
3779 }
3780
3781 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3782 int
3783 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3784                              int *peel_iters_epilogue,
3785                              stmt_vector_for_cost *scalar_cost_vec,
3786                              stmt_vector_for_cost *prologue_cost_vec,
3787                              stmt_vector_for_cost *epilogue_cost_vec)
3788 {
3789   int retval = 0;
3790
3791   *peel_iters_epilogue
3792     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3793
3794   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3795     {
3796       /* If peeled iterations are known but number of scalar loop
3797          iterations are unknown, count a taken branch per peeled loop.  */
3798       if (peel_iters_prologue > 0)
3799         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3800                                    NULL, NULL_TREE, 0, vect_prologue);
3801       if (*peel_iters_epilogue > 0)
3802         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3803                                     NULL, NULL_TREE, 0, vect_epilogue);
3804     }
3805
3806   stmt_info_for_cost *si;
3807   int j;
3808   if (peel_iters_prologue)
3809     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3810       retval += record_stmt_cost (prologue_cost_vec,
3811                                   si->count * peel_iters_prologue,
3812                                   si->kind, si->stmt_info, si->misalign,
3813                                   vect_prologue);
3814   if (*peel_iters_epilogue)
3815     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3816       retval += record_stmt_cost (epilogue_cost_vec,
3817                                   si->count * *peel_iters_epilogue,
3818                                   si->kind, si->stmt_info, si->misalign,
3819                                   vect_epilogue);
3820
3821   return retval;
3822 }
3823
3824 /* Function vect_estimate_min_profitable_iters
3825
3826    Return the number of iterations required for the vector version of the
3827    loop to be profitable relative to the cost of the scalar version of the
3828    loop.
3829
3830    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3831    of iterations for vectorization.  -1 value means loop vectorization
3832    is not profitable.  This returned value may be used for dynamic
3833    profitability check.
3834
3835    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3836    for static check against estimated number of iterations.  */
3837
3838 static void
3839 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3840                                     int *ret_min_profitable_niters,
3841                                     int *ret_min_profitable_estimate)
3842 {
3843   int min_profitable_iters;
3844   int min_profitable_estimate;
3845   int peel_iters_prologue;
3846   int peel_iters_epilogue;
3847   unsigned vec_inside_cost = 0;
3848   int vec_outside_cost = 0;
3849   unsigned vec_prologue_cost = 0;
3850   unsigned vec_epilogue_cost = 0;
3851   int scalar_single_iter_cost = 0;
3852   int scalar_outside_cost = 0;
3853   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3854   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3855   vector_costs *target_cost_data = loop_vinfo->vector_costs;
3856
3857   /* Cost model disabled.  */
3858   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3859     {
3860       if (dump_enabled_p ())
3861         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3862       *ret_min_profitable_niters = 0;
3863       *ret_min_profitable_estimate = 0;
3864       return;
3865     }
3866
3867   /* Requires loop versioning tests to handle misalignment.  */
3868   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3869     {
3870       /*  FIXME: Make cost depend on complexity of individual check.  */
3871       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3872       (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3873                             NULL, NULL_TREE, 0, vect_prologue);
3874       if (dump_enabled_p ())
3875         dump_printf (MSG_NOTE,
3876                      "cost model: Adding cost of checks for loop "
3877                      "versioning to treat misalignment.\n");
3878     }
3879
3880   /* Requires loop versioning with alias checks.  */
3881   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3882     {
3883       /*  FIXME: Make cost depend on complexity of individual check.  */
3884       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3885       (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3886                             NULL, NULL_TREE, 0, vect_prologue);
3887       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3888       if (len)
3889         /* Count LEN - 1 ANDs and LEN comparisons.  */
3890         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3891                               scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3892       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3893       if (len)
3894         {
3895           /* Count LEN - 1 ANDs and LEN comparisons.  */
3896           unsigned int nstmts = len * 2 - 1;
3897           /* +1 for each bias that needs adding.  */
3898           for (unsigned int i = 0; i < len; ++i)
3899             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3900               nstmts += 1;
3901           (void) add_stmt_cost (target_cost_data, nstmts,
3902                                 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3903         }
3904       if (dump_enabled_p ())
3905         dump_printf (MSG_NOTE,
3906                      "cost model: Adding cost of checks for loop "
3907                      "versioning aliasing.\n");
3908     }
3909
3910   /* Requires loop versioning with niter checks.  */
3911   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3912     {
3913       /*  FIXME: Make cost depend on complexity of individual check.  */
3914       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3915                             NULL, NULL_TREE, 0, vect_prologue);
3916       if (dump_enabled_p ())
3917         dump_printf (MSG_NOTE,
3918                      "cost model: Adding cost of checks for loop "
3919                      "versioning niters.\n");
3920     }
3921
3922   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3923     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3924                           NULL, NULL_TREE, 0, vect_prologue);
3925
3926   /* Count statements in scalar loop.  Using this as scalar cost for a single
3927      iteration for now.
3928
3929      TODO: Add outer loop support.
3930
3931      TODO: Consider assigning different costs to different scalar
3932      statements.  */
3933
3934   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
3935
3936   /* Add additional cost for the peeled instructions in prologue and epilogue
3937      loop.  (For fully-masked loops there will be no peeling.)
3938
3939      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3940      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3941
3942      TODO: Build an expression that represents peel_iters for prologue and
3943      epilogue to be used in a run-time test.  */
3944
3945   bool prologue_need_br_taken_cost = false;
3946   bool prologue_need_br_not_taken_cost = false;
3947
3948   /* Calculate peel_iters_prologue.  */
3949   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3950     peel_iters_prologue = 0;
3951   else if (npeel < 0)
3952     {
3953       peel_iters_prologue = assumed_vf / 2;
3954       if (dump_enabled_p ())
3955         dump_printf (MSG_NOTE, "cost model: "
3956                      "prologue peel iters set to vf/2.\n");
3957
3958       /* If peeled iterations are unknown, count a taken branch and a not taken
3959          branch per peeled loop.  Even if scalar loop iterations are known,
3960          vector iterations are not known since peeled prologue iterations are
3961          not known.  Hence guards remain the same.  */
3962       prologue_need_br_taken_cost = true;
3963       prologue_need_br_not_taken_cost = true;
3964     }
3965   else
3966     {
3967       peel_iters_prologue = npeel;
3968       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3969         /* If peeled iterations are known but number of scalar loop
3970            iterations are unknown, count a taken branch per peeled loop.  */
3971         prologue_need_br_taken_cost = true;
3972     }
3973
3974   bool epilogue_need_br_taken_cost = false;
3975   bool epilogue_need_br_not_taken_cost = false;
3976
3977   /* Calculate peel_iters_epilogue.  */
3978   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3979     /* We need to peel exactly one iteration for gaps.  */
3980     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3981   else if (npeel < 0)
3982     {
3983       /* If peeling for alignment is unknown, loop bound of main loop
3984          becomes unknown.  */
3985       peel_iters_epilogue = assumed_vf / 2;
3986       if (dump_enabled_p ())
3987         dump_printf (MSG_NOTE, "cost model: "
3988                      "epilogue peel iters set to vf/2 because "
3989                      "peeling for alignment is unknown.\n");
3990
3991       /* See the same reason above in peel_iters_prologue calculation.  */
3992       epilogue_need_br_taken_cost = true;
3993       epilogue_need_br_not_taken_cost = true;
3994     }
3995   else
3996     {
3997       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3998       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3999         /* If peeled iterations are known but number of scalar loop
4000            iterations are unknown, count a taken branch per peeled loop.  */
4001         epilogue_need_br_taken_cost = true;
4002     }
4003
4004   stmt_info_for_cost *si;
4005   int j;
4006   /* Add costs associated with peel_iters_prologue.  */
4007   if (peel_iters_prologue)
4008     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4009       {
4010         (void) add_stmt_cost (target_cost_data,
4011                               si->count * peel_iters_prologue, si->kind,
4012                               si->stmt_info, si->vectype, si->misalign,
4013                               vect_prologue);
4014       }
4015
4016   /* Add costs associated with peel_iters_epilogue.  */
4017   if (peel_iters_epilogue)
4018     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4019       {
4020         (void) add_stmt_cost (target_cost_data,
4021                               si->count * peel_iters_epilogue, si->kind,
4022                               si->stmt_info, si->vectype, si->misalign,
4023                               vect_epilogue);
4024       }
4025
4026   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4027
4028   if (prologue_need_br_taken_cost)
4029     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4030                           NULL, NULL_TREE, 0, vect_prologue);
4031
4032   if (prologue_need_br_not_taken_cost)
4033     (void) add_stmt_cost (target_cost_data, 1,
4034                           cond_branch_not_taken, NULL, NULL_TREE, 0,
4035                           vect_prologue);
4036
4037   if (epilogue_need_br_taken_cost)
4038     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4039                           NULL, NULL_TREE, 0, vect_epilogue);
4040
4041   if (epilogue_need_br_not_taken_cost)
4042     (void) add_stmt_cost (target_cost_data, 1,
4043                           cond_branch_not_taken, NULL, NULL_TREE, 0,
4044                           vect_epilogue);
4045
4046   /* Take care of special costs for rgroup controls of partial vectors.  */
4047   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4048     {
4049       /* Calculate how many masks we need to generate.  */
4050       unsigned int num_masks = 0;
4051       rgroup_controls *rgm;
4052       unsigned int num_vectors_m1;
4053       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4054         if (rgm->type)
4055           num_masks += num_vectors_m1 + 1;
4056       gcc_assert (num_masks > 0);
4057
4058       /* In the worst case, we need to generate each mask in the prologue
4059          and in the loop body.  One of the loop body mask instructions
4060          replaces the comparison in the scalar loop, and since we don't
4061          count the scalar comparison against the scalar body, we shouldn't
4062          count that vector instruction against the vector body either.
4063
4064          Sometimes we can use unpacks instead of generating prologue
4065          masks and sometimes the prologue mask will fold to a constant,
4066          so the actual prologue cost might be smaller.  However, it's
4067          simpler and safer to use the worst-case cost; if this ends up
4068          being the tie-breaker between vectorizing or not, then it's
4069          probably better not to vectorize.  */
4070       (void) add_stmt_cost (target_cost_data, num_masks,
4071                             vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4072       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4073                             vector_stmt, NULL, NULL_TREE, 0, vect_body);
4074     }
4075   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4076     {
4077       /* Referring to the functions vect_set_loop_condition_partial_vectors
4078          and vect_set_loop_controls_directly, we need to generate each
4079          length in the prologue and in the loop body if required. Although
4080          there are some possible optimizations, we consider the worst case
4081          here.  */
4082
4083       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4084       bool need_iterate_p
4085         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4086            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4087
4088       /* Calculate how many statements to be added.  */
4089       unsigned int prologue_stmts = 0;
4090       unsigned int body_stmts = 0;
4091
4092       rgroup_controls *rgc;
4093       unsigned int num_vectors_m1;
4094       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4095         if (rgc->type)
4096           {
4097             /* May need one SHIFT for nitems_total computation.  */
4098             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4099             if (nitems != 1 && !niters_known_p)
4100               prologue_stmts += 1;
4101
4102             /* May need one MAX and one MINUS for wrap around.  */
4103             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4104               prologue_stmts += 2;
4105
4106             /* Need one MAX and one MINUS for each batch limit excepting for
4107                the 1st one.  */
4108             prologue_stmts += num_vectors_m1 * 2;
4109
4110             unsigned int num_vectors = num_vectors_m1 + 1;
4111
4112             /* Need to set up lengths in prologue, only one MIN required
4113                for each since start index is zero.  */
4114             prologue_stmts += num_vectors;
4115
4116             /* Each may need two MINs and one MINUS to update lengths in body
4117                for next iteration.  */
4118             if (need_iterate_p)
4119               body_stmts += 3 * num_vectors;
4120           }
4121
4122       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4123                             scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4124       (void) add_stmt_cost (target_cost_data, body_stmts,
4125                             scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4126     }
4127
4128   /* FORNOW: The scalar outside cost is incremented in one of the
4129      following ways:
4130
4131      1. The vectorizer checks for alignment and aliasing and generates
4132      a condition that allows dynamic vectorization.  A cost model
4133      check is ANDED with the versioning condition.  Hence scalar code
4134      path now has the added cost of the versioning check.
4135
4136        if (cost > th & versioning_check)
4137          jmp to vector code
4138
4139      Hence run-time scalar is incremented by not-taken branch cost.
4140
4141      2. The vectorizer then checks if a prologue is required.  If the
4142      cost model check was not done before during versioning, it has to
4143      be done before the prologue check.
4144
4145        if (cost <= th)
4146          prologue = scalar_iters
4147        if (prologue == 0)
4148          jmp to vector code
4149        else
4150          execute prologue
4151        if (prologue == num_iters)
4152          go to exit
4153
4154      Hence the run-time scalar cost is incremented by a taken branch,
4155      plus a not-taken branch, plus a taken branch cost.
4156
4157      3. The vectorizer then checks if an epilogue is required.  If the
4158      cost model check was not done before during prologue check, it
4159      has to be done with the epilogue check.
4160
4161        if (prologue == 0)
4162          jmp to vector code
4163        else
4164          execute prologue
4165        if (prologue == num_iters)
4166          go to exit
4167        vector code:
4168          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4169            jmp to epilogue
4170
4171      Hence the run-time scalar cost should be incremented by 2 taken
4172      branches.
4173
4174      TODO: The back end may reorder the BBS's differently and reverse
4175      conditions/branch directions.  Change the estimates below to
4176      something more reasonable.  */
4177
4178   /* If the number of iterations is known and we do not do versioning, we can
4179      decide whether to vectorize at compile time.  Hence the scalar version
4180      do not carry cost model guard costs.  */
4181   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4182       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4183     {
4184       /* Cost model check occurs at versioning.  */
4185       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4186         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4187       else
4188         {
4189           /* Cost model check occurs at prologue generation.  */
4190           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4191             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4192               + vect_get_stmt_cost (cond_branch_not_taken);
4193           /* Cost model check occurs at epilogue generation.  */
4194           else
4195             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4196         }
4197     }
4198
4199   /* Complete the target-specific cost calculations.  */
4200   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4201                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
4202
4203   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4204
4205   if (dump_enabled_p ())
4206     {
4207       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4208       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4209                    vec_inside_cost);
4210       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4211                    vec_prologue_cost);
4212       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4213                    vec_epilogue_cost);
4214       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4215                    scalar_single_iter_cost);
4216       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4217                    scalar_outside_cost);
4218       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4219                    vec_outside_cost);
4220       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4221                    peel_iters_prologue);
4222       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4223                    peel_iters_epilogue);
4224     }
4225
4226   /* Calculate number of iterations required to make the vector version
4227      profitable, relative to the loop bodies only.  The following condition
4228      must hold true:
4229      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4230      where
4231      SIC = scalar iteration cost, VIC = vector iteration cost,
4232      VOC = vector outside cost, VF = vectorization factor,
4233      NPEEL = prologue iterations + epilogue iterations,
4234      SOC = scalar outside cost for run time cost model check.  */
4235
4236   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4237                           - vec_inside_cost);
4238   if (saving_per_viter <= 0)
4239     {
4240       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4241         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4242                     "vectorization did not happen for a simd loop");
4243
4244       if (dump_enabled_p ())
4245         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4246                          "cost model: the vector iteration cost = %d "
4247                          "divided by the scalar iteration cost = %d "
4248                          "is greater or equal to the vectorization factor = %d"
4249                          ".\n",
4250                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4251       *ret_min_profitable_niters = -1;
4252       *ret_min_profitable_estimate = -1;
4253       return;
4254     }
4255
4256   /* ??? The "if" arm is written to handle all cases; see below for what
4257      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4258   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4259     {
4260       /* Rewriting the condition above in terms of the number of
4261          vector iterations (vniters) rather than the number of
4262          scalar iterations (niters) gives:
4263
4264          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4265
4266          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4267
4268          For integer N, X and Y when X > 0:
4269
4270          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4271       int outside_overhead = (vec_outside_cost
4272                               - scalar_single_iter_cost * peel_iters_prologue
4273                               - scalar_single_iter_cost * peel_iters_epilogue
4274                               - scalar_outside_cost);
4275       /* We're only interested in cases that require at least one
4276          vector iteration.  */
4277       int min_vec_niters = 1;
4278       if (outside_overhead > 0)
4279         min_vec_niters = outside_overhead / saving_per_viter + 1;
4280
4281       if (dump_enabled_p ())
4282         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4283                      min_vec_niters);
4284
4285       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4286         {
4287           /* Now that we know the minimum number of vector iterations,
4288              find the minimum niters for which the scalar cost is larger:
4289
4290              SIC * niters > VIC * vniters + VOC - SOC
4291
4292              We know that the minimum niters is no more than
4293              vniters * VF + NPEEL, but it might be (and often is) less
4294              than that if a partial vector iteration is cheaper than the
4295              equivalent scalar code.  */
4296           int threshold = (vec_inside_cost * min_vec_niters
4297                            + vec_outside_cost
4298                            - scalar_outside_cost);
4299           if (threshold <= 0)
4300             min_profitable_iters = 1;
4301           else
4302             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4303         }
4304       else
4305         /* Convert the number of vector iterations into a number of
4306            scalar iterations.  */
4307         min_profitable_iters = (min_vec_niters * assumed_vf
4308                                 + peel_iters_prologue
4309                                 + peel_iters_epilogue);
4310     }
4311   else
4312     {
4313       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4314                               * assumed_vf
4315                               - vec_inside_cost * peel_iters_prologue
4316                               - vec_inside_cost * peel_iters_epilogue);
4317       if (min_profitable_iters <= 0)
4318         min_profitable_iters = 0;
4319       else
4320         {
4321           min_profitable_iters /= saving_per_viter;
4322
4323           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4324               <= (((int) vec_inside_cost * min_profitable_iters)
4325                   + (((int) vec_outside_cost - scalar_outside_cost)
4326                      * assumed_vf)))
4327             min_profitable_iters++;
4328         }
4329     }
4330
4331   if (dump_enabled_p ())
4332     dump_printf (MSG_NOTE,
4333                  "  Calculated minimum iters for profitability: %d\n",
4334                  min_profitable_iters);
4335
4336   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4337       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4338     /* We want the vectorized loop to execute at least once.  */
4339     min_profitable_iters = assumed_vf + peel_iters_prologue;
4340   else if (min_profitable_iters < peel_iters_prologue)
4341     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4342        vectorized loop executes at least once.  */
4343     min_profitable_iters = peel_iters_prologue;
4344
4345   if (dump_enabled_p ())
4346     dump_printf_loc (MSG_NOTE, vect_location,
4347                      "  Runtime profitability threshold = %d\n",
4348                      min_profitable_iters);
4349
4350   *ret_min_profitable_niters = min_profitable_iters;
4351
4352   /* Calculate number of iterations required to make the vector version
4353      profitable, relative to the loop bodies only.
4354
4355      Non-vectorized variant is SIC * niters and it must win over vector
4356      variant on the expected loop trip count.  The following condition must hold true:
4357      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4358
4359   if (vec_outside_cost <= 0)
4360     min_profitable_estimate = 0;
4361   /* ??? This "else if" arm is written to handle all cases; see below for
4362      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4363   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4364     {
4365       /* This is a repeat of the code above, but with + SOC rather
4366          than - SOC.  */
4367       int outside_overhead = (vec_outside_cost
4368                               - scalar_single_iter_cost * peel_iters_prologue
4369                               - scalar_single_iter_cost * peel_iters_epilogue
4370                               + scalar_outside_cost);
4371       int min_vec_niters = 1;
4372       if (outside_overhead > 0)
4373         min_vec_niters = outside_overhead / saving_per_viter + 1;
4374
4375       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4376         {
4377           int threshold = (vec_inside_cost * min_vec_niters
4378                            + vec_outside_cost
4379                            + scalar_outside_cost);
4380           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4381         }
4382       else
4383         min_profitable_estimate = (min_vec_niters * assumed_vf
4384                                    + peel_iters_prologue
4385                                    + peel_iters_epilogue);
4386     }
4387   else
4388     {
4389       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4390                                  * assumed_vf
4391                                  - vec_inside_cost * peel_iters_prologue
4392                                  - vec_inside_cost * peel_iters_epilogue)
4393                                  / ((scalar_single_iter_cost * assumed_vf)
4394                                    - vec_inside_cost);
4395     }
4396   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4397   if (dump_enabled_p ())
4398     dump_printf_loc (MSG_NOTE, vect_location,
4399                      "  Static estimate profitability threshold = %d\n",
4400                      min_profitable_estimate);
4401
4402   *ret_min_profitable_estimate = min_profitable_estimate;
4403 }
4404
4405 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4406    vector elements (not bits) for a vector with NELT elements.  */
4407 static void
4408 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4409                               vec_perm_builder *sel)
4410 {
4411   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4412      by vec_perm_indices.  */
4413   sel->new_vector (nelt, 1, 3);
4414   for (unsigned int i = 0; i < 3; i++)
4415     sel->quick_push (i + offset);
4416 }
4417
4418 /* Checks whether the target supports whole-vector shifts for vectors of mode
4419    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4420    it supports vec_perm_const with masks for all necessary shift amounts.  */
4421 static bool
4422 have_whole_vector_shift (machine_mode mode)
4423 {
4424   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4425     return true;
4426
4427   /* Variable-length vectors should be handled via the optab.  */
4428   unsigned int nelt;
4429   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4430     return false;
4431
4432   vec_perm_builder sel;
4433   vec_perm_indices indices;
4434   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4435     {
4436       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4437       indices.new_vector (sel, 2, nelt);
4438       if (!can_vec_perm_const_p (mode, indices, false))
4439         return false;
4440     }
4441   return true;
4442 }
4443
4444 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4445    functions. Design better to avoid maintenance issues.  */
4446
4447 /* Function vect_model_reduction_cost.
4448
4449    Models cost for a reduction operation, including the vector ops
4450    generated within the strip-mine loop in some cases, the initial
4451    definition before the loop, and the epilogue code that must be generated.  */
4452
4453 static void
4454 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4455                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4456                            vect_reduction_type reduction_type,
4457                            int ncopies, stmt_vector_for_cost *cost_vec)
4458 {
4459   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4460   tree vectype;
4461   machine_mode mode;
4462   class loop *loop = NULL;
4463
4464   if (loop_vinfo)
4465     loop = LOOP_VINFO_LOOP (loop_vinfo);
4466
4467   /* Condition reductions generate two reductions in the loop.  */
4468   if (reduction_type == COND_REDUCTION)
4469     ncopies *= 2;
4470
4471   vectype = STMT_VINFO_VECTYPE (stmt_info);
4472   mode = TYPE_MODE (vectype);
4473   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4474
4475   gimple_match_op op;
4476   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4477     gcc_unreachable ();
4478
4479   if (reduction_type == EXTRACT_LAST_REDUCTION)
4480     /* No extra instructions are needed in the prologue.  The loop body
4481        operations are costed in vectorizable_condition.  */
4482     inside_cost = 0;
4483   else if (reduction_type == FOLD_LEFT_REDUCTION)
4484     {
4485       /* No extra instructions needed in the prologue.  */
4486       prologue_cost = 0;
4487
4488       if (reduc_fn != IFN_LAST)
4489         /* Count one reduction-like operation per vector.  */
4490         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4491                                         stmt_info, 0, vect_body);
4492       else
4493         {
4494           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4495           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4496           inside_cost = record_stmt_cost (cost_vec, nelements,
4497                                           vec_to_scalar, stmt_info, 0,
4498                                           vect_body);
4499           inside_cost += record_stmt_cost (cost_vec, nelements,
4500                                            scalar_stmt, stmt_info, 0,
4501                                            vect_body);
4502         }
4503     }
4504   else
4505     {
4506       /* Add in cost for initial definition.
4507          For cond reduction we have four vectors: initial index, step,
4508          initial result of the data reduction, initial value of the index
4509          reduction.  */
4510       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4511       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4512                                          scalar_to_vec, stmt_info, 0,
4513                                          vect_prologue);
4514     }
4515
4516   /* Determine cost of epilogue code.
4517
4518      We have a reduction operator that will reduce the vector in one statement.
4519      Also requires scalar extract.  */
4520
4521   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4522     {
4523       if (reduc_fn != IFN_LAST)
4524         {
4525           if (reduction_type == COND_REDUCTION)
4526             {
4527               /* An EQ stmt and an COND_EXPR stmt.  */
4528               epilogue_cost += record_stmt_cost (cost_vec, 2,
4529                                                  vector_stmt, stmt_info, 0,
4530                                                  vect_epilogue);
4531               /* Reduction of the max index and a reduction of the found
4532                  values.  */
4533               epilogue_cost += record_stmt_cost (cost_vec, 2,
4534                                                  vec_to_scalar, stmt_info, 0,
4535                                                  vect_epilogue);
4536               /* A broadcast of the max value.  */
4537               epilogue_cost += record_stmt_cost (cost_vec, 1,
4538                                                  scalar_to_vec, stmt_info, 0,
4539                                                  vect_epilogue);
4540             }
4541           else
4542             {
4543               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4544                                                  stmt_info, 0, vect_epilogue);
4545               epilogue_cost += record_stmt_cost (cost_vec, 1,
4546                                                  vec_to_scalar, stmt_info, 0,
4547                                                  vect_epilogue);
4548             }
4549         }
4550       else if (reduction_type == COND_REDUCTION)
4551         {
4552           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4553           /* Extraction of scalar elements.  */
4554           epilogue_cost += record_stmt_cost (cost_vec,
4555                                              2 * estimated_nunits,
4556                                              vec_to_scalar, stmt_info, 0,
4557                                              vect_epilogue);
4558           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4559           epilogue_cost += record_stmt_cost (cost_vec,
4560                                              2 * estimated_nunits - 3,
4561                                              scalar_stmt, stmt_info, 0,
4562                                              vect_epilogue);
4563         }
4564       else if (reduction_type == EXTRACT_LAST_REDUCTION
4565                || reduction_type == FOLD_LEFT_REDUCTION)
4566         /* No extra instructions need in the epilogue.  */
4567         ;
4568       else
4569         {
4570           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4571           tree bitsize = TYPE_SIZE (op.type);
4572           int element_bitsize = tree_to_uhwi (bitsize);
4573           int nelements = vec_size_in_bits / element_bitsize;
4574
4575           if (op.code == COND_EXPR)
4576             op.code = MAX_EXPR;
4577
4578           /* We have a whole vector shift available.  */
4579           if (VECTOR_MODE_P (mode)
4580               && directly_supported_p (op.code, vectype)
4581               && have_whole_vector_shift (mode))
4582             {
4583               /* Final reduction via vector shifts and the reduction operator.
4584                  Also requires scalar extract.  */
4585               epilogue_cost += record_stmt_cost (cost_vec,
4586                                                  exact_log2 (nelements) * 2,
4587                                                  vector_stmt, stmt_info, 0,
4588                                                  vect_epilogue);
4589               epilogue_cost += record_stmt_cost (cost_vec, 1,
4590                                                  vec_to_scalar, stmt_info, 0,
4591                                                  vect_epilogue);
4592             }
4593           else
4594             /* Use extracts and reduction op for final reduction.  For N
4595                elements, we have N extracts and N-1 reduction ops.  */
4596             epilogue_cost += record_stmt_cost (cost_vec,
4597                                                nelements + nelements - 1,
4598                                                vector_stmt, stmt_info, 0,
4599                                                vect_epilogue);
4600         }
4601     }
4602
4603   if (dump_enabled_p ())
4604     dump_printf (MSG_NOTE,
4605                  "vect_model_reduction_cost: inside_cost = %d, "
4606                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4607                  prologue_cost, epilogue_cost);
4608 }
4609
4610 /* SEQ is a sequence of instructions that initialize the reduction
4611    described by REDUC_INFO.  Emit them in the appropriate place.  */
4612
4613 static void
4614 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4615                                 stmt_vec_info reduc_info, gimple *seq)
4616 {
4617   if (reduc_info->reused_accumulator)
4618     {
4619       /* When reusing an accumulator from the main loop, we only need
4620          initialization instructions if the main loop can be skipped.
4621          In that case, emit the initialization instructions at the end
4622          of the guard block that does the skip.  */
4623       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4624       gcc_assert (skip_edge);
4625       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4626       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4627     }
4628   else
4629     {
4630       /* The normal case: emit the initialization instructions on the
4631          preheader edge.  */
4632       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4633       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4634     }
4635 }
4636
4637 /* Function get_initial_def_for_reduction
4638
4639    Input:
4640    REDUC_INFO - the info_for_reduction
4641    INIT_VAL - the initial value of the reduction variable
4642    NEUTRAL_OP - a value that has no effect on the reduction, as per
4643                 neutral_op_for_reduction
4644
4645    Output:
4646    Return a vector variable, initialized according to the operation that
4647         STMT_VINFO performs. This vector will be used as the initial value
4648         of the vector of partial results.
4649
4650    The value we need is a vector in which element 0 has value INIT_VAL
4651    and every other element has value NEUTRAL_OP.  */
4652
4653 static tree
4654 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4655                                stmt_vec_info reduc_info,
4656                                tree init_val, tree neutral_op)
4657 {
4658   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4659   tree scalar_type = TREE_TYPE (init_val);
4660   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4661   tree init_def;
4662   gimple_seq stmts = NULL;
4663
4664   gcc_assert (vectype);
4665
4666   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4667               || SCALAR_FLOAT_TYPE_P (scalar_type));
4668
4669   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4670               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4671
4672   if (operand_equal_p (init_val, neutral_op))
4673     {
4674       /* If both elements are equal then the vector described above is
4675          just a splat.  */
4676       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4677       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4678     }
4679   else
4680     {
4681       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4682       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4683       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4684         {
4685           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4686              element 0.  */
4687           init_def = gimple_build_vector_from_val (&stmts, vectype,
4688                                                    neutral_op);
4689           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4690                                    vectype, init_def, init_val);
4691         }
4692       else
4693         {
4694           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4695           tree_vector_builder elts (vectype, 1, 2);
4696           elts.quick_push (init_val);
4697           elts.quick_push (neutral_op);
4698           init_def = gimple_build_vector (&stmts, &elts);
4699         }
4700     }
4701
4702   if (stmts)
4703     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4704   return init_def;
4705 }
4706
4707 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4708    which performs a reduction involving GROUP_SIZE scalar statements.
4709    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4710    is nonnull, introducing extra elements of that value will not change the
4711    result.  */
4712
4713 static void
4714 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4715                                 stmt_vec_info reduc_info,
4716                                 vec<tree> *vec_oprnds,
4717                                 unsigned int number_of_vectors,
4718                                 unsigned int group_size, tree neutral_op)
4719 {
4720   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4721   unsigned HOST_WIDE_INT nunits;
4722   unsigned j, number_of_places_left_in_vector;
4723   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4724   unsigned int i;
4725
4726   gcc_assert (group_size == initial_values.length () || neutral_op);
4727
4728   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4729      created vectors. It is greater than 1 if unrolling is performed.
4730
4731      For example, we have two scalar operands, s1 and s2 (e.g., group of
4732      strided accesses of size two), while NUNITS is four (i.e., four scalars
4733      of this type can be packed in a vector).  The output vector will contain
4734      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4735      will be 2).
4736
4737      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4738      vectors containing the operands.
4739
4740      For example, NUNITS is four as before, and the group size is 8
4741      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4742      {s5, s6, s7, s8}.  */
4743
4744   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4745     nunits = group_size;
4746
4747   number_of_places_left_in_vector = nunits;
4748   bool constant_p = true;
4749   tree_vector_builder elts (vector_type, nunits, 1);
4750   elts.quick_grow (nunits);
4751   gimple_seq ctor_seq = NULL;
4752   for (j = 0; j < nunits * number_of_vectors; ++j)
4753     {
4754       tree op;
4755       i = j % group_size;
4756
4757       /* Get the def before the loop.  In reduction chain we have only
4758          one initial value.  Else we have as many as PHIs in the group.  */
4759       if (i >= initial_values.length () || (j > i && neutral_op))
4760         op = neutral_op;
4761       else
4762         op = initial_values[i];
4763
4764       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4765       number_of_places_left_in_vector--;
4766       elts[nunits - number_of_places_left_in_vector - 1] = op;
4767       if (!CONSTANT_CLASS_P (op))
4768         constant_p = false;
4769
4770       if (number_of_places_left_in_vector == 0)
4771         {
4772           tree init;
4773           if (constant_p && !neutral_op
4774               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4775               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4776             /* Build the vector directly from ELTS.  */
4777             init = gimple_build_vector (&ctor_seq, &elts);
4778           else if (neutral_op)
4779             {
4780               /* Build a vector of the neutral value and shift the
4781                  other elements into place.  */
4782               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4783                                                    neutral_op);
4784               int k = nunits;
4785               while (k > 0 && elts[k - 1] == neutral_op)
4786                 k -= 1;
4787               while (k > 0)
4788                 {
4789                   k -= 1;
4790                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4791                                        vector_type, init, elts[k]);
4792                 }
4793             }
4794           else
4795             {
4796               /* First time round, duplicate ELTS to fill the
4797                  required number of vectors.  */
4798               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4799                                         elts, number_of_vectors, *vec_oprnds);
4800               break;
4801             }
4802           vec_oprnds->quick_push (init);
4803
4804           number_of_places_left_in_vector = nunits;
4805           elts.new_vector (vector_type, nunits, 1);
4806           elts.quick_grow (nunits);
4807           constant_p = true;
4808         }
4809     }
4810   if (ctor_seq != NULL)
4811     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4812 }
4813
4814 /* For a statement STMT_INFO taking part in a reduction operation return
4815    the stmt_vec_info the meta information is stored on.  */
4816
4817 stmt_vec_info
4818 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4819 {
4820   stmt_info = vect_orig_stmt (stmt_info);
4821   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4822   if (!is_a <gphi *> (stmt_info->stmt)
4823       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4824     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4825   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4826   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4827     {
4828       if (gimple_phi_num_args (phi) == 1)
4829         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4830     }
4831   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4832     {
4833       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4834       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4835         stmt_info = info;
4836     }
4837   return stmt_info;
4838 }
4839
4840 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4841    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
4842    return false.  */
4843
4844 static bool
4845 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4846                                 stmt_vec_info reduc_info)
4847 {
4848   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4849   if (!main_loop_vinfo)
4850     return false;
4851
4852   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4853     return false;
4854
4855   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4856   auto_vec<tree, 16> main_loop_results (num_phis);
4857   auto_vec<tree, 16> initial_values (num_phis);
4858   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4859     {
4860       /* The epilogue loop can be entered either from the main loop or
4861          from an earlier guard block.  */
4862       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4863       for (tree incoming_value : reduc_info->reduc_initial_values)
4864         {
4865           /* Look for:
4866
4867                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4868                                     INITIAL_VALUE(guard block)>.  */
4869           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4870
4871           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4872           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4873
4874           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4875           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4876
4877           main_loop_results.quick_push (from_main_loop);
4878           initial_values.quick_push (from_skip);
4879         }
4880     }
4881   else
4882     /* The main loop dominates the epilogue loop.  */
4883     main_loop_results.splice (reduc_info->reduc_initial_values);
4884
4885   /* See if the main loop has the kind of accumulator we need.  */
4886   vect_reusable_accumulator *accumulator
4887     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4888   if (!accumulator
4889       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4890       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4891                       accumulator->reduc_info->reduc_scalar_results.begin ()))
4892     return false;
4893
4894   /* Handle the case where we can reduce wider vectors to narrower ones.  */
4895   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4896   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4897   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4898                             TYPE_VECTOR_SUBPARTS (vectype)))
4899     return false;
4900
4901   /* Non-SLP reductions might apply an adjustment after the reduction
4902      operation, in order to simplify the initialization of the accumulator.
4903      If the epilogue loop carries on from where the main loop left off,
4904      it should apply the same adjustment to the final reduction result.
4905
4906      If the epilogue loop can also be entered directly (rather than via
4907      the main loop), we need to be able to handle that case in the same way,
4908      with the same adjustment.  (In principle we could add a PHI node
4909      to select the correct adjustment, but in practice that shouldn't be
4910      necessary.)  */
4911   tree main_adjustment
4912     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4913   if (loop_vinfo->main_loop_edge && main_adjustment)
4914     {
4915       gcc_assert (num_phis == 1);
4916       tree initial_value = initial_values[0];
4917       /* Check that we can use INITIAL_VALUE as the adjustment and
4918          initialize the accumulator with a neutral value instead.  */
4919       if (!operand_equal_p (initial_value, main_adjustment))
4920         return false;
4921       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
4922       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4923                                                     code, initial_value);
4924     }
4925   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4926   reduc_info->reduc_initial_values.truncate (0);
4927   reduc_info->reduc_initial_values.splice (initial_values);
4928   reduc_info->reused_accumulator = accumulator;
4929   return true;
4930 }
4931
4932 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4933    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
4934
4935 static tree
4936 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
4937                             gimple_seq *seq)
4938 {
4939   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4940   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4941   tree stype = TREE_TYPE (vectype);
4942   tree new_temp = vec_def;
4943   while (nunits > nunits1)
4944     {
4945       nunits /= 2;
4946       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4947                                                            stype, nunits);
4948       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4949
4950       /* The target has to make sure we support lowpart/highpart
4951          extraction, either via direct vector extract or through
4952          an integer mode punning.  */
4953       tree dst1, dst2;
4954       gimple *epilog_stmt;
4955       if (convert_optab_handler (vec_extract_optab,
4956                                  TYPE_MODE (TREE_TYPE (new_temp)),
4957                                  TYPE_MODE (vectype1))
4958           != CODE_FOR_nothing)
4959         {
4960           /* Extract sub-vectors directly once vec_extract becomes
4961              a conversion optab.  */
4962           dst1 = make_ssa_name (vectype1);
4963           epilog_stmt
4964               = gimple_build_assign (dst1, BIT_FIELD_REF,
4965                                      build3 (BIT_FIELD_REF, vectype1,
4966                                              new_temp, TYPE_SIZE (vectype1),
4967                                              bitsize_int (0)));
4968           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4969           dst2 =  make_ssa_name (vectype1);
4970           epilog_stmt
4971               = gimple_build_assign (dst2, BIT_FIELD_REF,
4972                                      build3 (BIT_FIELD_REF, vectype1,
4973                                              new_temp, TYPE_SIZE (vectype1),
4974                                              bitsize_int (bitsize)));
4975           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4976         }
4977       else
4978         {
4979           /* Extract via punning to appropriately sized integer mode
4980              vector.  */
4981           tree eltype = build_nonstandard_integer_type (bitsize, 1);
4982           tree etype = build_vector_type (eltype, 2);
4983           gcc_assert (convert_optab_handler (vec_extract_optab,
4984                                              TYPE_MODE (etype),
4985                                              TYPE_MODE (eltype))
4986                       != CODE_FOR_nothing);
4987           tree tem = make_ssa_name (etype);
4988           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4989                                              build1 (VIEW_CONVERT_EXPR,
4990                                                      etype, new_temp));
4991           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4992           new_temp = tem;
4993           tem = make_ssa_name (eltype);
4994           epilog_stmt
4995               = gimple_build_assign (tem, BIT_FIELD_REF,
4996                                      build3 (BIT_FIELD_REF, eltype,
4997                                              new_temp, TYPE_SIZE (eltype),
4998                                              bitsize_int (0)));
4999           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5000           dst1 = make_ssa_name (vectype1);
5001           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5002                                              build1 (VIEW_CONVERT_EXPR,
5003                                                      vectype1, tem));
5004           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5005           tem = make_ssa_name (eltype);
5006           epilog_stmt
5007               = gimple_build_assign (tem, BIT_FIELD_REF,
5008                                      build3 (BIT_FIELD_REF, eltype,
5009                                              new_temp, TYPE_SIZE (eltype),
5010                                              bitsize_int (bitsize)));
5011           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5012           dst2 =  make_ssa_name (vectype1);
5013           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5014                                              build1 (VIEW_CONVERT_EXPR,
5015                                                      vectype1, tem));
5016           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5017         }
5018
5019       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5020     }
5021
5022   return new_temp;
5023 }
5024
5025 /* Function vect_create_epilog_for_reduction
5026
5027    Create code at the loop-epilog to finalize the result of a reduction
5028    computation.
5029
5030    STMT_INFO is the scalar reduction stmt that is being vectorized.
5031    SLP_NODE is an SLP node containing a group of reduction statements. The
5032      first one in this group is STMT_INFO.
5033    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5034    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5035      (counting from 0)
5036
5037    This function:
5038    1. Completes the reduction def-use cycles.
5039    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5040       by calling the function specified by REDUC_FN if available, or by
5041       other means (whole-vector shifts or a scalar loop).
5042       The function also creates a new phi node at the loop exit to preserve
5043       loop-closed form, as illustrated below.
5044
5045      The flow at the entry to this function:
5046
5047         loop:
5048           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5049           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5050           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5051         loop_exit:
5052           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5053           use <s_out0>
5054           use <s_out0>
5055
5056      The above is transformed by this function into:
5057
5058         loop:
5059           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5060           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5061           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5062         loop_exit:
5063           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5064           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5065           v_out2 = reduce <v_out1>
5066           s_out3 = extract_field <v_out2, 0>
5067           s_out4 = adjust_result <s_out3>
5068           use <s_out4>
5069           use <s_out4>
5070 */
5071
5072 static void
5073 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5074                                   stmt_vec_info stmt_info,
5075                                   slp_tree slp_node,
5076                                   slp_instance slp_node_instance)
5077 {
5078   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5079   gcc_assert (reduc_info->is_reduc_info);
5080   /* For double reductions we need to get at the inner loop reduction
5081      stmt which has the meta info attached.  Our stmt_info is that of the
5082      loop-closed PHI of the inner loop which we remember as
5083      def for the reduction PHI generation.  */
5084   bool double_reduc = false;
5085   stmt_vec_info rdef_info = stmt_info;
5086   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5087     {
5088       gcc_assert (!slp_node);
5089       double_reduc = true;
5090       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5091                                             (stmt_info->stmt, 0));
5092       stmt_info = vect_stmt_to_vectorize (stmt_info);
5093     }
5094   gphi *reduc_def_stmt
5095     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5096   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5097   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5098   tree vectype;
5099   machine_mode mode;
5100   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5101   basic_block exit_bb;
5102   tree scalar_dest;
5103   tree scalar_type;
5104   gimple *new_phi = NULL, *phi;
5105   gimple_stmt_iterator exit_gsi;
5106   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5107   gimple *epilog_stmt = NULL;
5108   gimple *exit_phi;
5109   tree bitsize;
5110   tree def;
5111   tree orig_name, scalar_result;
5112   imm_use_iterator imm_iter, phi_imm_iter;
5113   use_operand_p use_p, phi_use_p;
5114   gimple *use_stmt;
5115   auto_vec<tree> reduc_inputs;
5116   int j, i;
5117   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5118   unsigned int group_size = 1, k;
5119   auto_vec<gimple *> phis;
5120   /* SLP reduction without reduction chain, e.g.,
5121      # a1 = phi <a2, a0>
5122      # b1 = phi <b2, b0>
5123      a2 = operation (a1)
5124      b2 = operation (b1)  */
5125   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5126   bool direct_slp_reduc;
5127   tree induction_index = NULL_TREE;
5128
5129   if (slp_node)
5130     group_size = SLP_TREE_LANES (slp_node);
5131
5132   if (nested_in_vect_loop_p (loop, stmt_info))
5133     {
5134       outer_loop = loop;
5135       loop = loop->inner;
5136       gcc_assert (!slp_node && double_reduc);
5137     }
5138
5139   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5140   gcc_assert (vectype);
5141   mode = TYPE_MODE (vectype);
5142
5143   tree induc_val = NULL_TREE;
5144   tree adjustment_def = NULL;
5145   if (slp_node)
5146     ;
5147   else
5148     {
5149       /* Optimize: for induction condition reduction, if we can't use zero
5150          for induc_val, use initial_def.  */
5151       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5152         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5153       else if (double_reduc)
5154         ;
5155       else
5156         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5157     }
5158
5159   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5160   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5161   if (slp_reduc)
5162     /* All statements produce live-out values.  */
5163     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5164   else if (slp_node)
5165     /* The last statement in the reduction chain produces the live-out
5166        value.  */
5167     single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5168
5169   unsigned vec_num;
5170   int ncopies;
5171   if (slp_node)
5172     {
5173       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5174       ncopies = 1;
5175     }
5176   else
5177     {
5178       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5179       vec_num = 1;
5180       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5181     }
5182
5183   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5184      which is updated with the current index of the loop for every match of
5185      the original loop's cond_expr (VEC_STMT).  This results in a vector
5186      containing the last time the condition passed for that vector lane.
5187      The first match will be a 1 to allow 0 to be used for non-matching
5188      indexes.  If there are no matches at all then the vector will be all
5189      zeroes.
5190
5191      PR92772: This algorithm is broken for architectures that support
5192      masked vectors, but do not provide fold_extract_last.  */
5193   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5194     {
5195       auto_vec<std::pair<tree, bool>, 2> ccompares;
5196       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5197       cond_info = vect_stmt_to_vectorize (cond_info);
5198       while (cond_info != reduc_info)
5199         {
5200           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5201             {
5202               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5203               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5204               ccompares.safe_push
5205                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5206                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5207             }
5208           cond_info
5209             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5210                                                  1 + STMT_VINFO_REDUC_IDX
5211                                                         (cond_info)));
5212           cond_info = vect_stmt_to_vectorize (cond_info);
5213         }
5214       gcc_assert (ccompares.length () != 0);
5215
5216       tree indx_before_incr, indx_after_incr;
5217       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5218       int scalar_precision
5219         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5220       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5221       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5222         (TYPE_MODE (vectype), cr_index_scalar_type,
5223          TYPE_VECTOR_SUBPARTS (vectype));
5224
5225       /* First we create a simple vector induction variable which starts
5226          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5227          vector size (STEP).  */
5228
5229       /* Create a {1,2,3,...} vector.  */
5230       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5231
5232       /* Create a vector of the step value.  */
5233       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5234       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5235
5236       /* Create an induction variable.  */
5237       gimple_stmt_iterator incr_gsi;
5238       bool insert_after;
5239       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5240       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5241                  insert_after, &indx_before_incr, &indx_after_incr);
5242
5243       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5244          filled with zeros (VEC_ZERO).  */
5245
5246       /* Create a vector of 0s.  */
5247       tree zero = build_zero_cst (cr_index_scalar_type);
5248       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5249
5250       /* Create a vector phi node.  */
5251       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5252       new_phi = create_phi_node (new_phi_tree, loop->header);
5253       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5254                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5255
5256       /* Now take the condition from the loops original cond_exprs
5257          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5258          every match uses values from the induction variable
5259          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5260          (NEW_PHI_TREE).
5261          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5262          the new cond_expr (INDEX_COND_EXPR).  */
5263       gimple_seq stmts = NULL;
5264       for (int i = ccompares.length () - 1; i != -1; --i)
5265         {
5266           tree ccompare = ccompares[i].first;
5267           if (ccompares[i].second)
5268             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5269                                          cr_index_vector_type,
5270                                          ccompare,
5271                                          indx_before_incr, new_phi_tree);
5272           else
5273             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5274                                          cr_index_vector_type,
5275                                          ccompare,
5276                                          new_phi_tree, indx_before_incr);
5277         }
5278       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5279
5280       /* Update the phi with the vec cond.  */
5281       induction_index = new_phi_tree;
5282       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5283                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5284     }
5285
5286   /* 2. Create epilog code.
5287         The reduction epilog code operates across the elements of the vector
5288         of partial results computed by the vectorized loop.
5289         The reduction epilog code consists of:
5290
5291         step 1: compute the scalar result in a vector (v_out2)
5292         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5293         step 3: adjust the scalar result (s_out3) if needed.
5294
5295         Step 1 can be accomplished using one the following three schemes:
5296           (scheme 1) using reduc_fn, if available.
5297           (scheme 2) using whole-vector shifts, if available.
5298           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5299                      combined.
5300
5301           The overall epilog code looks like this:
5302
5303           s_out0 = phi <s_loop>         # original EXIT_PHI
5304           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5305           v_out2 = reduce <v_out1>              # step 1
5306           s_out3 = extract_field <v_out2, 0>    # step 2
5307           s_out4 = adjust_result <s_out3>       # step 3
5308
5309           (step 3 is optional, and steps 1 and 2 may be combined).
5310           Lastly, the uses of s_out0 are replaced by s_out4.  */
5311
5312
5313   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5314          v_out1 = phi <VECT_DEF>
5315          Store them in NEW_PHIS.  */
5316   if (double_reduc)
5317     loop = outer_loop;
5318   exit_bb = single_exit (loop)->dest;
5319   exit_gsi = gsi_after_labels (exit_bb);
5320   reduc_inputs.create (slp_node ? vec_num : ncopies);
5321   for (unsigned i = 0; i < vec_num; i++)
5322     {
5323       gimple_seq stmts = NULL;
5324       if (slp_node)
5325         def = vect_get_slp_vect_def (slp_node, i);
5326       else
5327         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5328       for (j = 0; j < ncopies; j++)
5329         {
5330           tree new_def = copy_ssa_name (def);
5331           phi = create_phi_node (new_def, exit_bb);
5332           if (j)
5333             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5334           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5335           new_def = gimple_convert (&stmts, vectype, new_def);
5336           reduc_inputs.quick_push (new_def);
5337         }
5338       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5339     }
5340
5341   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5342          (i.e. when reduc_fn is not available) and in the final adjustment
5343          code (if needed).  Also get the original scalar reduction variable as
5344          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5345          represents a reduction pattern), the tree-code and scalar-def are
5346          taken from the original stmt that the pattern-stmt (STMT) replaces.
5347          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5348          are taken from STMT.  */
5349
5350   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5351   if (orig_stmt_info != stmt_info)
5352     {
5353       /* Reduction pattern  */
5354       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5355       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5356     }
5357
5358   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5359   scalar_type = TREE_TYPE (scalar_dest);
5360   scalar_results.create (group_size);
5361   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5362   bitsize = TYPE_SIZE (scalar_type);
5363
5364   /* True if we should implement SLP_REDUC using native reduction operations
5365      instead of scalar operations.  */
5366   direct_slp_reduc = (reduc_fn != IFN_LAST
5367                       && slp_reduc
5368                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5369
5370   /* In case of reduction chain, e.g.,
5371      # a1 = phi <a3, a0>
5372      a2 = operation (a1)
5373      a3 = operation (a2),
5374
5375      we may end up with more than one vector result.  Here we reduce them
5376      to one vector.
5377
5378      The same is true if we couldn't use a single defuse cycle.  */
5379   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5380       || direct_slp_reduc
5381       || ncopies > 1)
5382     {
5383       gimple_seq stmts = NULL;
5384       tree single_input = reduc_inputs[0];
5385       for (k = 1; k < reduc_inputs.length (); k++)
5386         single_input = gimple_build (&stmts, code, vectype,
5387                                      single_input, reduc_inputs[k]);
5388       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5389
5390       reduc_inputs.truncate (0);
5391       reduc_inputs.safe_push (single_input);
5392     }
5393
5394   tree orig_reduc_input = reduc_inputs[0];
5395
5396   /* If this loop is an epilogue loop that can be skipped after the
5397      main loop, we can only share a reduction operation between the
5398      main loop and the epilogue if we put it at the target of the
5399      skip edge.
5400
5401      We can still reuse accumulators if this check fails.  Doing so has
5402      the minor(?) benefit of making the epilogue loop's scalar result
5403      independent of the main loop's scalar result.  */
5404   bool unify_with_main_loop_p = false;
5405   if (reduc_info->reused_accumulator
5406       && loop_vinfo->skip_this_loop_edge
5407       && single_succ_p (exit_bb)
5408       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5409     {
5410       unify_with_main_loop_p = true;
5411
5412       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5413       reduc_inputs[0] = make_ssa_name (vectype);
5414       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5415       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5416                    UNKNOWN_LOCATION);
5417       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5418                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5419       exit_gsi = gsi_after_labels (reduc_block);
5420     }
5421
5422   /* Shouldn't be used beyond this point.  */
5423   exit_bb = nullptr;
5424
5425   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5426       && reduc_fn != IFN_LAST)
5427     {
5428       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5429          various data values where the condition matched and another vector
5430          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5431          need to extract the last matching index (which will be the index with
5432          highest value) and use this to index into the data vector.
5433          For the case where there were no matches, the data vector will contain
5434          all default values and the index vector will be all zeros.  */
5435
5436       /* Get various versions of the type of the vector of indexes.  */
5437       tree index_vec_type = TREE_TYPE (induction_index);
5438       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5439       tree index_scalar_type = TREE_TYPE (index_vec_type);
5440       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5441
5442       /* Get an unsigned integer version of the type of the data vector.  */
5443       int scalar_precision
5444         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5445       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5446       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5447                                                 vectype);
5448
5449       /* First we need to create a vector (ZERO_VEC) of zeros and another
5450          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5451          can create using a MAX reduction and then expanding.
5452          In the case where the loop never made any matches, the max index will
5453          be zero.  */
5454
5455       /* Vector of {0, 0, 0,...}.  */
5456       tree zero_vec = build_zero_cst (vectype);
5457
5458       /* Find maximum value from the vector of found indexes.  */
5459       tree max_index = make_ssa_name (index_scalar_type);
5460       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5461                                                           1, induction_index);
5462       gimple_call_set_lhs (max_index_stmt, max_index);
5463       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5464
5465       /* Vector of {max_index, max_index, max_index,...}.  */
5466       tree max_index_vec = make_ssa_name (index_vec_type);
5467       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5468                                                       max_index);
5469       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5470                                                         max_index_vec_rhs);
5471       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5472
5473       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5474          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5475          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5476          otherwise.  Only one value should match, resulting in a vector
5477          (VEC_COND) with one data value and the rest zeros.
5478          In the case where the loop never made any matches, every index will
5479          match, resulting in a vector with all data values (which will all be
5480          the default value).  */
5481
5482       /* Compare the max index vector to the vector of found indexes to find
5483          the position of the max value.  */
5484       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5485       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5486                                                       induction_index,
5487                                                       max_index_vec);
5488       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5489
5490       /* Use the compare to choose either values from the data vector or
5491          zero.  */
5492       tree vec_cond = make_ssa_name (vectype);
5493       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5494                                                    vec_compare,
5495                                                    reduc_inputs[0],
5496                                                    zero_vec);
5497       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5498
5499       /* Finally we need to extract the data value from the vector (VEC_COND)
5500          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5501          reduction, but because this doesn't exist, we can use a MAX reduction
5502          instead.  The data value might be signed or a float so we need to cast
5503          it first.
5504          In the case where the loop never made any matches, the data values are
5505          all identical, and so will reduce down correctly.  */
5506
5507       /* Make the matched data values unsigned.  */
5508       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5509       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5510                                        vec_cond);
5511       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5512                                                         VIEW_CONVERT_EXPR,
5513                                                         vec_cond_cast_rhs);
5514       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5515
5516       /* Reduce down to a scalar value.  */
5517       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5518       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5519                                                            1, vec_cond_cast);
5520       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5521       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5522
5523       /* Convert the reduced value back to the result type and set as the
5524          result.  */
5525       gimple_seq stmts = NULL;
5526       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5527                                data_reduc);
5528       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5529       scalar_results.safe_push (new_temp);
5530     }
5531   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5532            && reduc_fn == IFN_LAST)
5533     {
5534       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5535          idx = 0;
5536          idx_val = induction_index[0];
5537          val = data_reduc[0];
5538          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5539            if (induction_index[i] > idx_val)
5540              val = data_reduc[i], idx_val = induction_index[i];
5541          return val;  */
5542
5543       tree data_eltype = TREE_TYPE (vectype);
5544       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5545       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5546       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5547       /* Enforced by vectorizable_reduction, which ensures we have target
5548          support before allowing a conditional reduction on variable-length
5549          vectors.  */
5550       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5551       tree idx_val = NULL_TREE, val = NULL_TREE;
5552       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5553         {
5554           tree old_idx_val = idx_val;
5555           tree old_val = val;
5556           idx_val = make_ssa_name (idx_eltype);
5557           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5558                                              build3 (BIT_FIELD_REF, idx_eltype,
5559                                                      induction_index,
5560                                                      bitsize_int (el_size),
5561                                                      bitsize_int (off)));
5562           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5563           val = make_ssa_name (data_eltype);
5564           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5565                                              build3 (BIT_FIELD_REF,
5566                                                      data_eltype,
5567                                                      reduc_inputs[0],
5568                                                      bitsize_int (el_size),
5569                                                      bitsize_int (off)));
5570           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5571           if (off != 0)
5572             {
5573               tree new_idx_val = idx_val;
5574               if (off != v_size - el_size)
5575                 {
5576                   new_idx_val = make_ssa_name (idx_eltype);
5577                   epilog_stmt = gimple_build_assign (new_idx_val,
5578                                                      MAX_EXPR, idx_val,
5579                                                      old_idx_val);
5580                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5581                 }
5582               tree new_val = make_ssa_name (data_eltype);
5583               epilog_stmt = gimple_build_assign (new_val,
5584                                                  COND_EXPR,
5585                                                  build2 (GT_EXPR,
5586                                                          boolean_type_node,
5587                                                          idx_val,
5588                                                          old_idx_val),
5589                                                  val, old_val);
5590               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5591               idx_val = new_idx_val;
5592               val = new_val;
5593             }
5594         }
5595       /* Convert the reduced value back to the result type and set as the
5596          result.  */
5597       gimple_seq stmts = NULL;
5598       val = gimple_convert (&stmts, scalar_type, val);
5599       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5600       scalar_results.safe_push (val);
5601     }
5602
5603   /* 2.3 Create the reduction code, using one of the three schemes described
5604          above. In SLP we simply need to extract all the elements from the
5605          vector (without reducing them), so we use scalar shifts.  */
5606   else if (reduc_fn != IFN_LAST && !slp_reduc)
5607     {
5608       tree tmp;
5609       tree vec_elem_type;
5610
5611       /* Case 1:  Create:
5612          v_out2 = reduc_expr <v_out1>  */
5613
5614       if (dump_enabled_p ())
5615         dump_printf_loc (MSG_NOTE, vect_location,
5616                          "Reduce using direct vector reduction.\n");
5617
5618       gimple_seq stmts = NULL;
5619       vec_elem_type = TREE_TYPE (vectype);
5620       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5621                                vec_elem_type, reduc_inputs[0]);
5622       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5623       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5624
5625       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5626           && induc_val)
5627         {
5628           /* Earlier we set the initial value to be a vector if induc_val
5629              values.  Check the result and if it is induc_val then replace
5630              with the original initial value, unless induc_val is
5631              the same as initial_def already.  */
5632           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5633                                   induc_val);
5634           tree initial_def = reduc_info->reduc_initial_values[0];
5635
5636           tmp = make_ssa_name (new_scalar_dest);
5637           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5638                                              initial_def, new_temp);
5639           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5640           new_temp = tmp;
5641         }
5642
5643       scalar_results.safe_push (new_temp);
5644     }
5645   else if (direct_slp_reduc)
5646     {
5647       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5648          with the elements for other SLP statements replaced with the
5649          neutral value.  We can then do a normal reduction on each vector.  */
5650
5651       /* Enforced by vectorizable_reduction.  */
5652       gcc_assert (reduc_inputs.length () == 1);
5653       gcc_assert (pow2p_hwi (group_size));
5654
5655       gimple_seq seq = NULL;
5656
5657       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5658          and the same element size as VECTYPE.  */
5659       tree index = build_index_vector (vectype, 0, 1);
5660       tree index_type = TREE_TYPE (index);
5661       tree index_elt_type = TREE_TYPE (index_type);
5662       tree mask_type = truth_type_for (index_type);
5663
5664       /* Create a vector that, for each element, identifies which of
5665          the REDUC_GROUP_SIZE results should use it.  */
5666       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5667       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5668                             build_vector_from_val (index_type, index_mask));
5669
5670       /* Get a neutral vector value.  This is simply a splat of the neutral
5671          scalar value if we have one, otherwise the initial scalar value
5672          is itself a neutral value.  */
5673       tree vector_identity = NULL_TREE;
5674       tree neutral_op = NULL_TREE;
5675       if (slp_node)
5676         {
5677           tree initial_value = NULL_TREE;
5678           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5679             initial_value = reduc_info->reduc_initial_values[0];
5680           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5681                                                  initial_value);
5682         }
5683       if (neutral_op)
5684         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5685                                                         neutral_op);
5686       for (unsigned int i = 0; i < group_size; ++i)
5687         {
5688           /* If there's no univeral neutral value, we can use the
5689              initial scalar value from the original PHI.  This is used
5690              for MIN and MAX reduction, for example.  */
5691           if (!neutral_op)
5692             {
5693               tree scalar_value = reduc_info->reduc_initial_values[i];
5694               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5695                                              scalar_value);
5696               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5697                                                               scalar_value);
5698             }
5699
5700           /* Calculate the equivalent of:
5701
5702              sel[j] = (index[j] == i);
5703
5704              which selects the elements of REDUC_INPUTS[0] that should
5705              be included in the result.  */
5706           tree compare_val = build_int_cst (index_elt_type, i);
5707           compare_val = build_vector_from_val (index_type, compare_val);
5708           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5709                                    index, compare_val);
5710
5711           /* Calculate the equivalent of:
5712
5713              vec = seq ? reduc_inputs[0] : vector_identity;
5714
5715              VEC is now suitable for a full vector reduction.  */
5716           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5717                                    sel, reduc_inputs[0], vector_identity);
5718
5719           /* Do the reduction and convert it to the appropriate type.  */
5720           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5721                                       TREE_TYPE (vectype), vec);
5722           scalar = gimple_convert (&seq, scalar_type, scalar);
5723           scalar_results.safe_push (scalar);
5724         }
5725       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5726     }
5727   else
5728     {
5729       bool reduce_with_shift;
5730       tree vec_temp;
5731
5732       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5733
5734       /* See if the target wants to do the final (shift) reduction
5735          in a vector mode of smaller size and first reduce upper/lower
5736          halves against each other.  */
5737       enum machine_mode mode1 = mode;
5738       tree stype = TREE_TYPE (vectype);
5739       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5740       unsigned nunits1 = nunits;
5741       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5742           && reduc_inputs.length () == 1)
5743         {
5744           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5745           /* For SLP reductions we have to make sure lanes match up, but
5746              since we're doing individual element final reduction reducing
5747              vector width here is even more important.
5748              ???  We can also separate lanes with permutes, for the common
5749              case of power-of-two group-size odd/even extracts would work.  */
5750           if (slp_reduc && nunits != nunits1)
5751             {
5752               nunits1 = least_common_multiple (nunits1, group_size);
5753               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5754             }
5755         }
5756       if (!slp_reduc
5757           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5758         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5759
5760       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5761                                                            stype, nunits1);
5762       reduce_with_shift = have_whole_vector_shift (mode1);
5763       if (!VECTOR_MODE_P (mode1)
5764           || !directly_supported_p (code, vectype1))
5765         reduce_with_shift = false;
5766
5767       /* First reduce the vector to the desired vector size we should
5768          do shift reduction on by combining upper and lower halves.  */
5769       gimple_seq stmts = NULL;
5770       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5771                                              code, &stmts);
5772       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5773       reduc_inputs[0] = new_temp;
5774
5775       if (reduce_with_shift && !slp_reduc)
5776         {
5777           int element_bitsize = tree_to_uhwi (bitsize);
5778           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5779              for variable-length vectors and also requires direct target support
5780              for loop reductions.  */
5781           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5782           int nelements = vec_size_in_bits / element_bitsize;
5783           vec_perm_builder sel;
5784           vec_perm_indices indices;
5785
5786           int elt_offset;
5787
5788           tree zero_vec = build_zero_cst (vectype1);
5789           /* Case 2: Create:
5790              for (offset = nelements/2; offset >= 1; offset/=2)
5791                 {
5792                   Create:  va' = vec_shift <va, offset>
5793                   Create:  va = vop <va, va'>
5794                 }  */
5795
5796           tree rhs;
5797
5798           if (dump_enabled_p ())
5799             dump_printf_loc (MSG_NOTE, vect_location,
5800                              "Reduce using vector shifts\n");
5801
5802           gimple_seq stmts = NULL;
5803           new_temp = gimple_convert (&stmts, vectype1, new_temp);
5804           for (elt_offset = nelements / 2;
5805                elt_offset >= 1;
5806                elt_offset /= 2)
5807             {
5808               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5809               indices.new_vector (sel, 2, nelements);
5810               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5811               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5812                                        new_temp, zero_vec, mask);
5813               new_temp = gimple_build (&stmts, code,
5814                                        vectype1, new_name, new_temp);
5815             }
5816           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5817
5818           /* 2.4  Extract the final scalar result.  Create:
5819              s_out3 = extract_field <v_out2, bitpos>  */
5820
5821           if (dump_enabled_p ())
5822             dump_printf_loc (MSG_NOTE, vect_location,
5823                              "extract scalar result\n");
5824
5825           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5826                         bitsize, bitsize_zero_node);
5827           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5828           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5829           gimple_assign_set_lhs (epilog_stmt, new_temp);
5830           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5831           scalar_results.safe_push (new_temp);
5832         }
5833       else
5834         {
5835           /* Case 3: Create:
5836              s = extract_field <v_out2, 0>
5837              for (offset = element_size;
5838                   offset < vector_size;
5839                   offset += element_size;)
5840                {
5841                  Create:  s' = extract_field <v_out2, offset>
5842                  Create:  s = op <s, s'>  // For non SLP cases
5843                }  */
5844
5845           if (dump_enabled_p ())
5846             dump_printf_loc (MSG_NOTE, vect_location,
5847                              "Reduce using scalar code.\n");
5848
5849           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5850           int element_bitsize = tree_to_uhwi (bitsize);
5851           tree compute_type = TREE_TYPE (vectype);
5852           gimple_seq stmts = NULL;
5853           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5854             {
5855               int bit_offset;
5856               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5857                                        vec_temp, bitsize, bitsize_zero_node);
5858
5859               /* In SLP we don't need to apply reduction operation, so we just
5860                  collect s' values in SCALAR_RESULTS.  */
5861               if (slp_reduc)
5862                 scalar_results.safe_push (new_temp);
5863
5864               for (bit_offset = element_bitsize;
5865                    bit_offset < vec_size_in_bits;
5866                    bit_offset += element_bitsize)
5867                 {
5868                   tree bitpos = bitsize_int (bit_offset);
5869                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
5870                                            compute_type, vec_temp,
5871                                            bitsize, bitpos);
5872                   if (slp_reduc)
5873                     {
5874                       /* In SLP we don't need to apply reduction operation, so
5875                          we just collect s' values in SCALAR_RESULTS.  */
5876                       new_temp = new_name;
5877                       scalar_results.safe_push (new_name);
5878                     }
5879                   else
5880                     new_temp = gimple_build (&stmts, code, compute_type,
5881                                              new_name, new_temp);
5882                 }
5883             }
5884
5885           /* The only case where we need to reduce scalar results in SLP, is
5886              unrolling.  If the size of SCALAR_RESULTS is greater than
5887              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5888              REDUC_GROUP_SIZE.  */
5889           if (slp_reduc)
5890             {
5891               tree res, first_res, new_res;
5892
5893               /* Reduce multiple scalar results in case of SLP unrolling.  */
5894               for (j = group_size; scalar_results.iterate (j, &res);
5895                    j++)
5896                 {
5897                   first_res = scalar_results[j % group_size];
5898                   new_res = gimple_build (&stmts, code, compute_type,
5899                                           first_res, res);
5900                   scalar_results[j % group_size] = new_res;
5901                 }
5902               scalar_results.truncate (group_size);
5903               for (k = 0; k < group_size; k++)
5904                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5905                                                     scalar_results[k]);
5906             }
5907           else
5908             {
5909               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5910               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5911               scalar_results.safe_push (new_temp);
5912             }
5913
5914           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5915         }
5916
5917       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5918           && induc_val)
5919         {
5920           /* Earlier we set the initial value to be a vector if induc_val
5921              values.  Check the result and if it is induc_val then replace
5922              with the original initial value, unless induc_val is
5923              the same as initial_def already.  */
5924           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5925                                   induc_val);
5926           tree initial_def = reduc_info->reduc_initial_values[0];
5927
5928           tree tmp = make_ssa_name (new_scalar_dest);
5929           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5930                                              initial_def, new_temp);
5931           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5932           scalar_results[0] = tmp;
5933         }
5934     }
5935
5936   /* 2.5 Adjust the final result by the initial value of the reduction
5937          variable. (When such adjustment is not needed, then
5938          'adjustment_def' is zero).  For example, if code is PLUS we create:
5939          new_temp = loop_exit_def + adjustment_def  */
5940
5941   if (adjustment_def)
5942     {
5943       gcc_assert (!slp_reduc);
5944       gimple_seq stmts = NULL;
5945       if (double_reduc)
5946         {
5947           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5948           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5949           new_temp = gimple_build (&stmts, code, vectype,
5950                                    reduc_inputs[0], adjustment_def);
5951         }
5952       else
5953         {
5954           new_temp = scalar_results[0];
5955           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5956           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5957           new_temp = gimple_build (&stmts, code, scalar_type,
5958                                    new_temp, adjustment_def);
5959         }
5960
5961       epilog_stmt = gimple_seq_last_stmt (stmts);
5962       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5963       scalar_results[0] = new_temp;
5964     }
5965
5966   /* Record this operation if it could be reused by the epilogue loop.  */
5967   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5968     loop_vinfo->reusable_accumulators.put (scalar_results[0],
5969                                            { orig_reduc_input, reduc_info });
5970
5971   if (double_reduc)
5972     loop = outer_loop;
5973
5974   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5975           phis with new adjusted scalar results, i.e., replace use <s_out0>
5976           with use <s_out4>.
5977
5978      Transform:
5979         loop_exit:
5980           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5981           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5982           v_out2 = reduce <v_out1>
5983           s_out3 = extract_field <v_out2, 0>
5984           s_out4 = adjust_result <s_out3>
5985           use <s_out0>
5986           use <s_out0>
5987
5988      into:
5989
5990         loop_exit:
5991           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5992           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5993           v_out2 = reduce <v_out1>
5994           s_out3 = extract_field <v_out2, 0>
5995           s_out4 = adjust_result <s_out3>
5996           use <s_out4>
5997           use <s_out4> */
5998
5999   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6000   for (k = 0; k < live_out_stmts.size (); k++)
6001     {
6002       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6003       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6004
6005       phis.create (3);
6006       /* Find the loop-closed-use at the loop exit of the original scalar
6007          result.  (The reduction result is expected to have two immediate uses,
6008          one at the latch block, and one at the loop exit).  For double
6009          reductions we are looking for exit phis of the outer loop.  */
6010       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6011         {
6012           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6013             {
6014               if (!is_gimple_debug (USE_STMT (use_p)))
6015                 phis.safe_push (USE_STMT (use_p));
6016             }
6017           else
6018             {
6019               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6020                 {
6021                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6022
6023                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6024                     {
6025                       if (!flow_bb_inside_loop_p (loop,
6026                                              gimple_bb (USE_STMT (phi_use_p)))
6027                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6028                         phis.safe_push (USE_STMT (phi_use_p));
6029                     }
6030                 }
6031             }
6032         }
6033
6034       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6035         {
6036           /* Replace the uses:  */
6037           orig_name = PHI_RESULT (exit_phi);
6038
6039           /* Look for a single use at the target of the skip edge.  */
6040           if (unify_with_main_loop_p)
6041             {
6042               use_operand_p use_p;
6043               gimple *user;
6044               if (!single_imm_use (orig_name, &use_p, &user))
6045                 gcc_unreachable ();
6046               orig_name = gimple_get_lhs (user);
6047             }
6048
6049           scalar_result = scalar_results[k];
6050           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6051             {
6052               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6053                 SET_USE (use_p, scalar_result);
6054               update_stmt (use_stmt);
6055             }
6056         }
6057
6058       phis.release ();
6059     }
6060 }
6061
6062 /* Return a vector of type VECTYPE that is equal to the vector select
6063    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6064    before GSI.  */
6065
6066 static tree
6067 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6068                      tree vec, tree identity)
6069 {
6070   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6071   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6072                                           mask, vec, identity);
6073   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6074   return cond;
6075 }
6076
6077 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6078    order, starting with LHS.  Insert the extraction statements before GSI and
6079    associate the new scalar SSA names with variable SCALAR_DEST.
6080    Return the SSA name for the result.  */
6081
6082 static tree
6083 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6084                        tree_code code, tree lhs, tree vector_rhs)
6085 {
6086   tree vectype = TREE_TYPE (vector_rhs);
6087   tree scalar_type = TREE_TYPE (vectype);
6088   tree bitsize = TYPE_SIZE (scalar_type);
6089   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6090   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6091
6092   for (unsigned HOST_WIDE_INT bit_offset = 0;
6093        bit_offset < vec_size_in_bits;
6094        bit_offset += element_bitsize)
6095     {
6096       tree bitpos = bitsize_int (bit_offset);
6097       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6098                          bitsize, bitpos);
6099
6100       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6101       rhs = make_ssa_name (scalar_dest, stmt);
6102       gimple_assign_set_lhs (stmt, rhs);
6103       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6104
6105       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6106       tree new_name = make_ssa_name (scalar_dest, stmt);
6107       gimple_assign_set_lhs (stmt, new_name);
6108       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6109       lhs = new_name;
6110     }
6111   return lhs;
6112 }
6113
6114 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6115    type of the vector input.  */
6116
6117 static internal_fn
6118 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6119 {
6120   internal_fn mask_reduc_fn;
6121
6122   switch (reduc_fn)
6123     {
6124     case IFN_FOLD_LEFT_PLUS:
6125       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6126       break;
6127
6128     default:
6129       return IFN_LAST;
6130     }
6131
6132   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6133                                       OPTIMIZE_FOR_SPEED))
6134     return mask_reduc_fn;
6135   return IFN_LAST;
6136 }
6137
6138 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6139    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6140    statement.  CODE is the operation performed by STMT_INFO and OPS are
6141    its scalar operands.  REDUC_INDEX is the index of the operand in
6142    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6143    implements in-order reduction, or IFN_LAST if we should open-code it.
6144    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6145    that should be used to control the operation in a fully-masked loop.  */
6146
6147 static bool
6148 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6149                                stmt_vec_info stmt_info,
6150                                gimple_stmt_iterator *gsi,
6151                                gimple **vec_stmt, slp_tree slp_node,
6152                                gimple *reduc_def_stmt,
6153                                tree_code code, internal_fn reduc_fn,
6154                                tree ops[3], tree vectype_in,
6155                                int reduc_index, vec_loop_masks *masks)
6156 {
6157   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6158   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6159   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6160
6161   int ncopies;
6162   if (slp_node)
6163     ncopies = 1;
6164   else
6165     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6166
6167   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6168   gcc_assert (ncopies == 1);
6169   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6170
6171   if (slp_node)
6172     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6173                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6174
6175   tree op0 = ops[1 - reduc_index];
6176
6177   int group_size = 1;
6178   stmt_vec_info scalar_dest_def_info;
6179   auto_vec<tree> vec_oprnds0;
6180   if (slp_node)
6181     {
6182       auto_vec<vec<tree> > vec_defs (2);
6183       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6184       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6185       vec_defs[0].release ();
6186       vec_defs[1].release ();
6187       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6188       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6189     }
6190   else
6191     {
6192       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6193                                      op0, &vec_oprnds0);
6194       scalar_dest_def_info = stmt_info;
6195     }
6196
6197   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6198   tree scalar_type = TREE_TYPE (scalar_dest);
6199   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6200
6201   int vec_num = vec_oprnds0.length ();
6202   gcc_assert (vec_num == 1 || slp_node);
6203   tree vec_elem_type = TREE_TYPE (vectype_out);
6204   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6205
6206   tree vector_identity = NULL_TREE;
6207   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6208     vector_identity = build_zero_cst (vectype_out);
6209
6210   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6211   int i;
6212   tree def0;
6213   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6214     {
6215       gimple *new_stmt;
6216       tree mask = NULL_TREE;
6217       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6218         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6219
6220       /* Handle MINUS by adding the negative.  */
6221       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6222         {
6223           tree negated = make_ssa_name (vectype_out);
6224           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6225           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6226           def0 = negated;
6227         }
6228
6229       if (mask && mask_reduc_fn == IFN_LAST)
6230         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6231                                     vector_identity);
6232
6233       /* On the first iteration the input is simply the scalar phi
6234          result, and for subsequent iterations it is the output of
6235          the preceding operation.  */
6236       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6237         {
6238           if (mask && mask_reduc_fn != IFN_LAST)
6239             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6240                                                    def0, mask);
6241           else
6242             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6243                                                    def0);
6244           /* For chained SLP reductions the output of the previous reduction
6245              operation serves as the input of the next. For the final statement
6246              the output cannot be a temporary - we reuse the original
6247              scalar destination of the last statement.  */
6248           if (i != vec_num - 1)
6249             {
6250               gimple_set_lhs (new_stmt, scalar_dest_var);
6251               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6252               gimple_set_lhs (new_stmt, reduc_var);
6253             }
6254         }
6255       else
6256         {
6257           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6258                                              reduc_var, def0);
6259           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6260           /* Remove the statement, so that we can use the same code paths
6261              as for statements that we've just created.  */
6262           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6263           gsi_remove (&tmp_gsi, true);
6264         }
6265
6266       if (i == vec_num - 1)
6267         {
6268           gimple_set_lhs (new_stmt, scalar_dest);
6269           vect_finish_replace_stmt (loop_vinfo,
6270                                     scalar_dest_def_info,
6271                                     new_stmt);
6272         }
6273       else
6274         vect_finish_stmt_generation (loop_vinfo,
6275                                      scalar_dest_def_info,
6276                                      new_stmt, gsi);
6277
6278       if (slp_node)
6279         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6280       else
6281         {
6282           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6283           *vec_stmt = new_stmt;
6284         }
6285     }
6286
6287   return true;
6288 }
6289
6290 /* Function is_nonwrapping_integer_induction.
6291
6292    Check if STMT_VINO (which is part of loop LOOP) both increments and
6293    does not cause overflow.  */
6294
6295 static bool
6296 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6297 {
6298   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6299   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6300   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6301   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6302   widest_int ni, max_loop_value, lhs_max;
6303   wi::overflow_type overflow = wi::OVF_NONE;
6304
6305   /* Make sure the loop is integer based.  */
6306   if (TREE_CODE (base) != INTEGER_CST
6307       || TREE_CODE (step) != INTEGER_CST)
6308     return false;
6309
6310   /* Check that the max size of the loop will not wrap.  */
6311
6312   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6313     return true;
6314
6315   if (! max_stmt_executions (loop, &ni))
6316     return false;
6317
6318   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6319                             &overflow);
6320   if (overflow)
6321     return false;
6322
6323   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6324                             TYPE_SIGN (lhs_type), &overflow);
6325   if (overflow)
6326     return false;
6327
6328   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6329           <= TYPE_PRECISION (lhs_type));
6330 }
6331
6332 /* Check if masking can be supported by inserting a conditional expression.
6333    CODE is the code for the operation.  COND_FN is the conditional internal
6334    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6335 static bool
6336 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6337                          tree vectype_in)
6338 {
6339   if (cond_fn != IFN_LAST
6340       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6341                                          OPTIMIZE_FOR_SPEED))
6342     return false;
6343
6344   if (code.is_tree_code ())
6345     switch (tree_code (code))
6346       {
6347       case DOT_PROD_EXPR:
6348       case SAD_EXPR:
6349         return true;
6350
6351       default:
6352         break;
6353       }
6354   return false;
6355 }
6356
6357 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6358    code for the operation.  VOP is the array of operands.  MASK is the loop
6359    mask.  GSI is a statement iterator used to place the new conditional
6360    expression.  */
6361 static void
6362 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6363                       gimple_stmt_iterator *gsi)
6364 {
6365   switch (tree_code (code))
6366     {
6367     case DOT_PROD_EXPR:
6368       {
6369         tree vectype = TREE_TYPE (vop[1]);
6370         tree zero = build_zero_cst (vectype);
6371         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6372         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6373                                                mask, vop[1], zero);
6374         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6375         vop[1] = masked_op1;
6376         break;
6377       }
6378
6379     case SAD_EXPR:
6380       {
6381         tree vectype = TREE_TYPE (vop[1]);
6382         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6383         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6384                                                mask, vop[1], vop[0]);
6385         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6386         vop[1] = masked_op1;
6387         break;
6388       }
6389
6390     default:
6391       gcc_unreachable ();
6392     }
6393 }
6394
6395 /* Function vectorizable_reduction.
6396
6397    Check if STMT_INFO performs a reduction operation that can be vectorized.
6398    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6399    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6400    Return true if STMT_INFO is vectorizable in this way.
6401
6402    This function also handles reduction idioms (patterns) that have been
6403    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6404    may be of this form:
6405      X = pattern_expr (arg0, arg1, ..., X)
6406    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6407    sequence that had been detected and replaced by the pattern-stmt
6408    (STMT_INFO).
6409
6410    This function also handles reduction of condition expressions, for example:
6411      for (int i = 0; i < N; i++)
6412        if (a[i] < value)
6413          last = a[i];
6414    This is handled by vectorising the loop and creating an additional vector
6415    containing the loop indexes for which "a[i] < value" was true.  In the
6416    function epilogue this is reduced to a single max value and then used to
6417    index into the vector of results.
6418
6419    In some cases of reduction patterns, the type of the reduction variable X is
6420    different than the type of the other arguments of STMT_INFO.
6421    In such cases, the vectype that is used when transforming STMT_INFO into
6422    a vector stmt is different than the vectype that is used to determine the
6423    vectorization factor, because it consists of a different number of elements
6424    than the actual number of elements that are being operated upon in parallel.
6425
6426    For example, consider an accumulation of shorts into an int accumulator.
6427    On some targets it's possible to vectorize this pattern operating on 8
6428    shorts at a time (hence, the vectype for purposes of determining the
6429    vectorization factor should be V8HI); on the other hand, the vectype that
6430    is used to create the vector form is actually V4SI (the type of the result).
6431
6432    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6433    indicates what is the actual level of parallelism (V8HI in the example), so
6434    that the right vectorization factor would be derived.  This vectype
6435    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6436    be used to create the vectorized stmt.  The right vectype for the vectorized
6437    stmt is obtained from the type of the result X:
6438       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6439
6440    This means that, contrary to "regular" reductions (or "regular" stmts in
6441    general), the following equation:
6442       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6443    does *NOT* necessarily hold for reduction patterns.  */
6444
6445 bool
6446 vectorizable_reduction (loop_vec_info loop_vinfo,
6447                         stmt_vec_info stmt_info, slp_tree slp_node,
6448                         slp_instance slp_node_instance,
6449                         stmt_vector_for_cost *cost_vec)
6450 {
6451   tree vectype_in = NULL_TREE;
6452   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6453   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6454   stmt_vec_info cond_stmt_vinfo = NULL;
6455   int i;
6456   int ncopies;
6457   bool single_defuse_cycle = false;
6458   bool nested_cycle = false;
6459   bool double_reduc = false;
6460   int vec_num;
6461   tree tem;
6462   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6463   tree cond_reduc_val = NULL_TREE;
6464
6465   /* Make sure it was already recognized as a reduction computation.  */
6466   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6467       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6468       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6469     return false;
6470
6471   /* The stmt we store reduction analysis meta on.  */
6472   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6473   reduc_info->is_reduc_info = true;
6474
6475   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6476     {
6477       if (is_a <gphi *> (stmt_info->stmt))
6478         {
6479           if (slp_node)
6480             {
6481               /* We eventually need to set a vector type on invariant
6482                  arguments.  */
6483               unsigned j;
6484               slp_tree child;
6485               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6486                 if (!vect_maybe_update_slp_op_vectype
6487                        (child, SLP_TREE_VECTYPE (slp_node)))
6488                   {
6489                     if (dump_enabled_p ())
6490                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491                                        "incompatible vector types for "
6492                                        "invariants\n");
6493                     return false;
6494                   }
6495             }
6496           /* Analysis for double-reduction is done on the outer
6497              loop PHI, nested cycles have no further restrictions.  */
6498           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6499         }
6500       else
6501         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6502       return true;
6503     }
6504
6505   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6506   stmt_vec_info phi_info = stmt_info;
6507   if (!is_a <gphi *> (stmt_info->stmt))
6508     {
6509       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6510       return true;
6511     }
6512   if (slp_node)
6513     {
6514       slp_node_instance->reduc_phis = slp_node;
6515       /* ???  We're leaving slp_node to point to the PHIs, we only
6516          need it to get at the number of vector stmts which wasn't
6517          yet initialized for the instance root.  */
6518     }
6519   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6520     stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6521   else
6522     {
6523       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6524                   == vect_double_reduction_def);
6525       use_operand_p use_p;
6526       gimple *use_stmt;
6527       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6528                                  &use_p, &use_stmt);
6529       gcc_assert (res);
6530       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6531       stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6532     }
6533
6534   /* PHIs should not participate in patterns.  */
6535   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6536   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6537
6538   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6539      and compute the reduction chain length.  Discover the real
6540      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6541   tree reduc_def
6542     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6543                              loop_latch_edge
6544                                (gimple_bb (reduc_def_phi)->loop_father));
6545   unsigned reduc_chain_length = 0;
6546   bool only_slp_reduc_chain = true;
6547   stmt_info = NULL;
6548   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6549   while (reduc_def != PHI_RESULT (reduc_def_phi))
6550     {
6551       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6552       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6553       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6554         {
6555           if (dump_enabled_p ())
6556             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6557                              "reduction chain broken by patterns.\n");
6558           return false;
6559         }
6560       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6561         only_slp_reduc_chain = false;
6562       /* ???  For epilogue generation live members of the chain need
6563          to point back to the PHI via their original stmt for
6564          info_for_reduction to work.  */
6565       if (STMT_VINFO_LIVE_P (vdef))
6566         STMT_VINFO_REDUC_DEF (def) = phi_info;
6567       gimple_match_op op;
6568       if (!gimple_extract_op (vdef->stmt, &op))
6569         {
6570           if (dump_enabled_p ())
6571             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6572                              "reduction chain includes unsupported"
6573                              " statement type.\n");
6574           return false;
6575         }
6576       if (CONVERT_EXPR_CODE_P (op.code))
6577         {
6578           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6579             {
6580               if (dump_enabled_p ())
6581                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582                                  "conversion in the reduction chain.\n");
6583               return false;
6584             }
6585         }
6586       else if (!stmt_info)
6587         /* First non-conversion stmt.  */
6588         stmt_info = vdef;
6589       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6590       reduc_chain_length++;
6591       if (!stmt_info && slp_node)
6592         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6593     }
6594   /* PHIs should not participate in patterns.  */
6595   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6596
6597   if (nested_in_vect_loop_p (loop, stmt_info))
6598     {
6599       loop = loop->inner;
6600       nested_cycle = true;
6601     }
6602
6603   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6604      element.  */
6605   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6606     {
6607       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6608       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6609     }
6610   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6611     gcc_assert (slp_node
6612                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6613
6614   /* 1. Is vectorizable reduction?  */
6615   /* Not supportable if the reduction variable is used in the loop, unless
6616      it's a reduction chain.  */
6617   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6618       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6619     return false;
6620
6621   /* Reductions that are not used even in an enclosing outer-loop,
6622      are expected to be "live" (used out of the loop).  */
6623   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6624       && !STMT_VINFO_LIVE_P (stmt_info))
6625     return false;
6626
6627   /* 2. Has this been recognized as a reduction pattern?
6628
6629      Check if STMT represents a pattern that has been recognized
6630      in earlier analysis stages.  For stmts that represent a pattern,
6631      the STMT_VINFO_RELATED_STMT field records the last stmt in
6632      the original sequence that constitutes the pattern.  */
6633
6634   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6635   if (orig_stmt_info)
6636     {
6637       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6638       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6639     }
6640
6641   /* 3. Check the operands of the operation.  The first operands are defined
6642         inside the loop body. The last operand is the reduction variable,
6643         which is defined by the loop-header-phi.  */
6644
6645   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6646   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6647   gimple_match_op op;
6648   if (!gimple_extract_op (stmt_info->stmt, &op))
6649     gcc_unreachable ();
6650   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6651                             || op.code == WIDEN_SUM_EXPR
6652                             || op.code == SAD_EXPR);
6653   enum optab_subtype optab_query_kind = optab_vector;
6654   if (op.code == DOT_PROD_EXPR
6655       && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6656           != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6657     optab_query_kind = optab_vector_mixed_sign;
6658
6659   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6660       && !SCALAR_FLOAT_TYPE_P (op.type))
6661     return false;
6662
6663   /* Do not try to vectorize bit-precision reductions.  */
6664   if (!type_has_mode_precision_p (op.type))
6665     return false;
6666
6667   /* For lane-reducing ops we're reducing the number of reduction PHIs
6668      which means the only use of that may be in the lane-reducing operation.  */
6669   if (lane_reduc_code_p
6670       && reduc_chain_length != 1
6671       && !only_slp_reduc_chain)
6672     {
6673       if (dump_enabled_p ())
6674         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6675                          "lane-reducing reduction with extra stmts.\n");
6676       return false;
6677     }
6678
6679   /* All uses but the last are expected to be defined in the loop.
6680      The last use is the reduction variable.  In case of nested cycle this
6681      assumption is not true: we use reduc_index to record the index of the
6682      reduction variable.  */
6683   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6684   /* We need to skip an extra operand for COND_EXPRs with embedded
6685      comparison.  */
6686   unsigned opno_adjust = 0;
6687   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6688     opno_adjust = 1;
6689   for (i = 0; i < (int) op.num_ops; i++)
6690     {
6691       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6692       if (i == 0 && op.code == COND_EXPR)
6693         continue;
6694
6695       stmt_vec_info def_stmt_info;
6696       enum vect_def_type dt;
6697       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6698                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6699                                &tem, &def_stmt_info))
6700         {
6701           if (dump_enabled_p ())
6702             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6703                              "use not simple.\n");
6704           return false;
6705         }
6706       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6707         continue;
6708
6709       /* There should be only one cycle def in the stmt, the one
6710          leading to reduc_def.  */
6711       if (VECTORIZABLE_CYCLE_DEF (dt))
6712         return false;
6713
6714       /* To properly compute ncopies we are interested in the widest
6715          non-reduction input type in case we're looking at a widening
6716          accumulation that we later handle in vect_transform_reduction.  */
6717       if (lane_reduc_code_p
6718           && tem
6719           && (!vectype_in
6720               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6721                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6722         vectype_in = tem;
6723
6724       if (op.code == COND_EXPR)
6725         {
6726           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6727           if (dt == vect_constant_def)
6728             {
6729               cond_reduc_dt = dt;
6730               cond_reduc_val = op.ops[i];
6731             }
6732           if (dt == vect_induction_def
6733               && def_stmt_info
6734               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6735             {
6736               cond_reduc_dt = dt;
6737               cond_stmt_vinfo = def_stmt_info;
6738             }
6739         }
6740     }
6741   if (!vectype_in)
6742     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6743   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6744
6745   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6746   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6747   /* If we have a condition reduction, see if we can simplify it further.  */
6748   if (v_reduc_type == COND_REDUCTION)
6749     {
6750       if (slp_node)
6751         return false;
6752
6753       /* When the condition uses the reduction value in the condition, fail.  */
6754       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6755         {
6756           if (dump_enabled_p ())
6757             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758                              "condition depends on previous iteration\n");
6759           return false;
6760         }
6761
6762       if (reduc_chain_length == 1
6763           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6764                                              vectype_in, OPTIMIZE_FOR_SPEED))
6765         {
6766           if (dump_enabled_p ())
6767             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6768                              "optimizing condition reduction with"
6769                              " FOLD_EXTRACT_LAST.\n");
6770           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6771         }
6772       else if (cond_reduc_dt == vect_induction_def)
6773         {
6774           tree base
6775             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6776           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6777
6778           gcc_assert (TREE_CODE (base) == INTEGER_CST
6779                       && TREE_CODE (step) == INTEGER_CST);
6780           cond_reduc_val = NULL_TREE;
6781           enum tree_code cond_reduc_op_code = ERROR_MARK;
6782           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6783           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6784             ;
6785           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6786              above base; punt if base is the minimum value of the type for
6787              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6788           else if (tree_int_cst_sgn (step) == -1)
6789             {
6790               cond_reduc_op_code = MIN_EXPR;
6791               if (tree_int_cst_sgn (base) == -1)
6792                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6793               else if (tree_int_cst_lt (base,
6794                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6795                 cond_reduc_val
6796                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6797             }
6798           else
6799             {
6800               cond_reduc_op_code = MAX_EXPR;
6801               if (tree_int_cst_sgn (base) == 1)
6802                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6803               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6804                                         base))
6805                 cond_reduc_val
6806                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6807             }
6808           if (cond_reduc_val)
6809             {
6810               if (dump_enabled_p ())
6811                 dump_printf_loc (MSG_NOTE, vect_location,
6812                                  "condition expression based on "
6813                                  "integer induction.\n");
6814               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6815               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6816                 = cond_reduc_val;
6817               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6818             }
6819         }
6820       else if (cond_reduc_dt == vect_constant_def)
6821         {
6822           enum vect_def_type cond_initial_dt;
6823           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6824           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6825           if (cond_initial_dt == vect_constant_def
6826               && types_compatible_p (TREE_TYPE (cond_initial_val),
6827                                      TREE_TYPE (cond_reduc_val)))
6828             {
6829               tree e = fold_binary (LE_EXPR, boolean_type_node,
6830                                     cond_initial_val, cond_reduc_val);
6831               if (e && (integer_onep (e) || integer_zerop (e)))
6832                 {
6833                   if (dump_enabled_p ())
6834                     dump_printf_loc (MSG_NOTE, vect_location,
6835                                      "condition expression based on "
6836                                      "compile time constant.\n");
6837                   /* Record reduction code at analysis stage.  */
6838                   STMT_VINFO_REDUC_CODE (reduc_info)
6839                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6840                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6841                 }
6842             }
6843         }
6844     }
6845
6846   if (STMT_VINFO_LIVE_P (phi_info))
6847     return false;
6848
6849   if (slp_node)
6850     ncopies = 1;
6851   else
6852     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6853
6854   gcc_assert (ncopies >= 1);
6855
6856   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6857
6858   if (nested_cycle)
6859     {
6860       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6861                   == vect_double_reduction_def);
6862       double_reduc = true;
6863     }
6864
6865   /* 4.2. Check support for the epilog operation.
6866
6867           If STMT represents a reduction pattern, then the type of the
6868           reduction variable may be different than the type of the rest
6869           of the arguments.  For example, consider the case of accumulation
6870           of shorts into an int accumulator; The original code:
6871                         S1: int_a = (int) short_a;
6872           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6873
6874           was replaced with:
6875                         STMT: int_acc = widen_sum <short_a, int_acc>
6876
6877           This means that:
6878           1. The tree-code that is used to create the vector operation in the
6879              epilog code (that reduces the partial results) is not the
6880              tree-code of STMT, but is rather the tree-code of the original
6881              stmt from the pattern that STMT is replacing.  I.e, in the example
6882              above we want to use 'widen_sum' in the loop, but 'plus' in the
6883              epilog.
6884           2. The type (mode) we use to check available target support
6885              for the vector operation to be created in the *epilog*, is
6886              determined by the type of the reduction variable (in the example
6887              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6888              However the type (mode) we use to check available target support
6889              for the vector operation to be created *inside the loop*, is
6890              determined by the type of the other arguments to STMT (in the
6891              example we'd check this: optab_handler (widen_sum_optab,
6892              vect_short_mode)).
6893
6894           This is contrary to "regular" reductions, in which the types of all
6895           the arguments are the same as the type of the reduction variable.
6896           For "regular" reductions we can therefore use the same vector type
6897           (and also the same tree-code) when generating the epilog code and
6898           when generating the code inside the loop.  */
6899
6900   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6901   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6902
6903   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6904   if (reduction_type == TREE_CODE_REDUCTION)
6905     {
6906       /* Check whether it's ok to change the order of the computation.
6907          Generally, when vectorizing a reduction we change the order of the
6908          computation.  This may change the behavior of the program in some
6909          cases, so we need to check that this is ok.  One exception is when
6910          vectorizing an outer-loop: the inner-loop is executed sequentially,
6911          and therefore vectorizing reductions in the inner-loop during
6912          outer-loop vectorization is safe.  Likewise when we are vectorizing
6913          a series of reductions using SLP and the VF is one the reductions
6914          are performed in scalar order.  */
6915       if (slp_node
6916           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6917           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6918         ;
6919       else if (needs_fold_left_reduction_p (op.type, orig_code))
6920         {
6921           /* When vectorizing a reduction chain w/o SLP the reduction PHI
6922              is not directy used in stmt.  */
6923           if (!only_slp_reduc_chain
6924               && reduc_chain_length != 1)
6925             {
6926               if (dump_enabled_p ())
6927                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6928                                  "in-order reduction chain without SLP.\n");
6929               return false;
6930             }
6931           STMT_VINFO_REDUC_TYPE (reduc_info)
6932             = reduction_type = FOLD_LEFT_REDUCTION;
6933         }
6934       else if (!commutative_binary_op_p (orig_code, op.type)
6935                || !associative_binary_op_p (orig_code, op.type))
6936         {
6937           if (dump_enabled_p ())
6938             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6939                             "reduction: not commutative/associative");
6940           return false;
6941         }
6942     }
6943
6944   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6945       && ncopies > 1)
6946     {
6947       if (dump_enabled_p ())
6948         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6949                          "multiple types in double reduction or condition "
6950                          "reduction or fold-left reduction.\n");
6951       return false;
6952     }
6953
6954   internal_fn reduc_fn = IFN_LAST;
6955   if (reduction_type == TREE_CODE_REDUCTION
6956       || reduction_type == FOLD_LEFT_REDUCTION
6957       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6958       || reduction_type == CONST_COND_REDUCTION)
6959     {
6960       if (reduction_type == FOLD_LEFT_REDUCTION
6961           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6962           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6963         {
6964           if (reduc_fn != IFN_LAST
6965               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6966                                                   OPTIMIZE_FOR_SPEED))
6967             {
6968               if (dump_enabled_p ())
6969                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6970                                  "reduc op not supported by target.\n");
6971
6972               reduc_fn = IFN_LAST;
6973             }
6974         }
6975       else
6976         {
6977           if (!nested_cycle || double_reduc)
6978             {
6979               if (dump_enabled_p ())
6980                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6981                                  "no reduc code for scalar code.\n");
6982
6983               return false;
6984             }
6985         }
6986     }
6987   else if (reduction_type == COND_REDUCTION)
6988     {
6989       int scalar_precision
6990         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
6991       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6992       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
6993                                                 vectype_out);
6994
6995       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6996                                           OPTIMIZE_FOR_SPEED))
6997         reduc_fn = IFN_REDUC_MAX;
6998     }
6999   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7000
7001   if (reduction_type != EXTRACT_LAST_REDUCTION
7002       && (!nested_cycle || double_reduc)
7003       && reduc_fn == IFN_LAST
7004       && !nunits_out.is_constant ())
7005     {
7006       if (dump_enabled_p ())
7007         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008                          "missing target support for reduction on"
7009                          " variable-length vectors.\n");
7010       return false;
7011     }
7012
7013   /* For SLP reductions, see if there is a neutral value we can use.  */
7014   tree neutral_op = NULL_TREE;
7015   if (slp_node)
7016     {
7017       tree initial_value = NULL_TREE;
7018       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7019         initial_value = vect_phi_initial_value (reduc_def_phi);
7020       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7021                                              orig_code, initial_value);
7022     }
7023
7024   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7025     {
7026       /* We can't support in-order reductions of code such as this:
7027
7028            for (int i = 0; i < n1; ++i)
7029              for (int j = 0; j < n2; ++j)
7030                l += a[j];
7031
7032          since GCC effectively transforms the loop when vectorizing:
7033
7034            for (int i = 0; i < n1 / VF; ++i)
7035              for (int j = 0; j < n2; ++j)
7036                for (int k = 0; k < VF; ++k)
7037                  l += a[j];
7038
7039          which is a reassociation of the original operation.  */
7040       if (dump_enabled_p ())
7041         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7042                          "in-order double reduction not supported.\n");
7043
7044       return false;
7045     }
7046
7047   if (reduction_type == FOLD_LEFT_REDUCTION
7048       && slp_node
7049       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7050     {
7051       /* We cannot use in-order reductions in this case because there is
7052          an implicit reassociation of the operations involved.  */
7053       if (dump_enabled_p ())
7054         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7055                          "in-order unchained SLP reductions not supported.\n");
7056       return false;
7057     }
7058
7059   /* For double reductions, and for SLP reductions with a neutral value,
7060      we construct a variable-length initial vector by loading a vector
7061      full of the neutral value and then shift-and-inserting the start
7062      values into the low-numbered elements.  */
7063   if ((double_reduc || neutral_op)
7064       && !nunits_out.is_constant ()
7065       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7066                                           vectype_out, OPTIMIZE_FOR_SPEED))
7067     {
7068       if (dump_enabled_p ())
7069         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7070                          "reduction on variable-length vectors requires"
7071                          " target support for a vector-shift-and-insert"
7072                          " operation.\n");
7073       return false;
7074     }
7075
7076   /* Check extra constraints for variable-length unchained SLP reductions.  */
7077   if (STMT_SLP_TYPE (stmt_info)
7078       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7079       && !nunits_out.is_constant ())
7080     {
7081       /* We checked above that we could build the initial vector when
7082          there's a neutral element value.  Check here for the case in
7083          which each SLP statement has its own initial value and in which
7084          that value needs to be repeated for every instance of the
7085          statement within the initial vector.  */
7086       unsigned int group_size = SLP_TREE_LANES (slp_node);
7087       if (!neutral_op
7088           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7089                                               TREE_TYPE (vectype_out)))
7090         {
7091           if (dump_enabled_p ())
7092             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7093                              "unsupported form of SLP reduction for"
7094                              " variable-length vectors: cannot build"
7095                              " initial vector.\n");
7096           return false;
7097         }
7098       /* The epilogue code relies on the number of elements being a multiple
7099          of the group size.  The duplicate-and-interleave approach to setting
7100          up the initial vector does too.  */
7101       if (!multiple_p (nunits_out, group_size))
7102         {
7103           if (dump_enabled_p ())
7104             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105                              "unsupported form of SLP reduction for"
7106                              " variable-length vectors: the vector size"
7107                              " is not a multiple of the number of results.\n");
7108           return false;
7109         }
7110     }
7111
7112   if (reduction_type == COND_REDUCTION)
7113     {
7114       widest_int ni;
7115
7116       if (! max_loop_iterations (loop, &ni))
7117         {
7118           if (dump_enabled_p ())
7119             dump_printf_loc (MSG_NOTE, vect_location,
7120                              "loop count not known, cannot create cond "
7121                              "reduction.\n");
7122           return false;
7123         }
7124       /* Convert backedges to iterations.  */
7125       ni += 1;
7126
7127       /* The additional index will be the same type as the condition.  Check
7128          that the loop can fit into this less one (because we'll use up the
7129          zero slot for when there are no matches).  */
7130       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7131       if (wi::geu_p (ni, wi::to_widest (max_index)))
7132         {
7133           if (dump_enabled_p ())
7134             dump_printf_loc (MSG_NOTE, vect_location,
7135                              "loop size is greater than data size.\n");
7136           return false;
7137         }
7138     }
7139
7140   /* In case the vectorization factor (VF) is bigger than the number
7141      of elements that we can fit in a vectype (nunits), we have to generate
7142      more than one vector stmt - i.e - we need to "unroll" the
7143      vector stmt by a factor VF/nunits.  For more details see documentation
7144      in vectorizable_operation.  */
7145
7146   /* If the reduction is used in an outer loop we need to generate
7147      VF intermediate results, like so (e.g. for ncopies=2):
7148         r0 = phi (init, r0)
7149         r1 = phi (init, r1)
7150         r0 = x0 + r0;
7151         r1 = x1 + r1;
7152     (i.e. we generate VF results in 2 registers).
7153     In this case we have a separate def-use cycle for each copy, and therefore
7154     for each copy we get the vector def for the reduction variable from the
7155     respective phi node created for this copy.
7156
7157     Otherwise (the reduction is unused in the loop nest), we can combine
7158     together intermediate results, like so (e.g. for ncopies=2):
7159         r = phi (init, r)
7160         r = x0 + r;
7161         r = x1 + r;
7162    (i.e. we generate VF/2 results in a single register).
7163    In this case for each copy we get the vector def for the reduction variable
7164    from the vectorized reduction operation generated in the previous iteration.
7165
7166    This only works when we see both the reduction PHI and its only consumer
7167    in vectorizable_reduction and there are no intermediate stmts
7168    participating.  */
7169   if (ncopies > 1
7170       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7171       && reduc_chain_length == 1)
7172     single_defuse_cycle = true;
7173
7174   if (single_defuse_cycle || lane_reduc_code_p)
7175     {
7176       gcc_assert (op.code != COND_EXPR);
7177
7178       /* 4. Supportable by target?  */
7179       bool ok = true;
7180
7181       /* 4.1. check support for the operation in the loop  */
7182       machine_mode vec_mode = TYPE_MODE (vectype_in);
7183       if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7184         {
7185           if (dump_enabled_p ())
7186             dump_printf (MSG_NOTE, "op not supported by target.\n");
7187           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7188               || !vect_can_vectorize_without_simd_p (op.code))
7189             ok = false;
7190           else
7191             if (dump_enabled_p ())
7192               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7193         }
7194
7195       if (vect_emulated_vector_p (vectype_in)
7196           && !vect_can_vectorize_without_simd_p (op.code))
7197         {
7198           if (dump_enabled_p ())
7199             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7200           return false;
7201         }
7202
7203       /* lane-reducing operations have to go through vect_transform_reduction.
7204          For the other cases try without the single cycle optimization.  */
7205       if (!ok)
7206         {
7207           if (lane_reduc_code_p)
7208             return false;
7209           else
7210             single_defuse_cycle = false;
7211         }
7212     }
7213   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7214
7215   /* If the reduction stmt is one of the patterns that have lane
7216      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7217   if ((ncopies > 1 && ! single_defuse_cycle)
7218       && lane_reduc_code_p)
7219     {
7220       if (dump_enabled_p ())
7221         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7222                          "multi def-use cycle not possible for lane-reducing "
7223                          "reduction operation\n");
7224       return false;
7225     }
7226
7227   if (slp_node
7228       && !(!single_defuse_cycle
7229            && !lane_reduc_code_p
7230            && reduction_type != FOLD_LEFT_REDUCTION))
7231     for (i = 0; i < (int) op.num_ops; i++)
7232       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7233         {
7234           if (dump_enabled_p ())
7235             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7236                              "incompatible vector types for invariants\n");
7237           return false;
7238         }
7239
7240   if (slp_node)
7241     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7242   else
7243     vec_num = 1;
7244
7245   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7246                              reduction_type, ncopies, cost_vec);
7247   /* Cost the reduction op inside the loop if transformed via
7248      vect_transform_reduction.  Otherwise this is costed by the
7249      separate vectorizable_* routines.  */
7250   if (single_defuse_cycle || lane_reduc_code_p)
7251     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7252
7253   if (dump_enabled_p ()
7254       && reduction_type == FOLD_LEFT_REDUCTION)
7255     dump_printf_loc (MSG_NOTE, vect_location,
7256                      "using an in-order (fold-left) reduction.\n");
7257   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7258   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7259      reductions go through their own vectorizable_* routines.  */
7260   if (!single_defuse_cycle
7261       && !lane_reduc_code_p
7262       && reduction_type != FOLD_LEFT_REDUCTION)
7263     {
7264       stmt_vec_info tem
7265         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7266       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7267         {
7268           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7269           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7270         }
7271       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7272       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7273     }
7274   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7275     {
7276       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7277       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7278
7279       if (reduction_type != FOLD_LEFT_REDUCTION
7280           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7281           && (cond_fn == IFN_LAST
7282               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7283                                                   OPTIMIZE_FOR_SPEED)))
7284         {
7285           if (dump_enabled_p ())
7286             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7287                              "can't operate on partial vectors because"
7288                              " no conditional operation is available.\n");
7289           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7290         }
7291       else if (reduction_type == FOLD_LEFT_REDUCTION
7292                && reduc_fn == IFN_LAST
7293                && !expand_vec_cond_expr_p (vectype_in,
7294                                            truth_type_for (vectype_in),
7295                                            SSA_NAME))
7296         {
7297           if (dump_enabled_p ())
7298             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299                              "can't operate on partial vectors because"
7300                              " no conditional operation is available.\n");
7301           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7302         }
7303       else
7304         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7305                                vectype_in, NULL);
7306     }
7307   return true;
7308 }
7309
7310 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7311    value.  */
7312
7313 bool
7314 vect_transform_reduction (loop_vec_info loop_vinfo,
7315                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7316                           gimple **vec_stmt, slp_tree slp_node)
7317 {
7318   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7319   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7320   int i;
7321   int ncopies;
7322   int vec_num;
7323
7324   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7325   gcc_assert (reduc_info->is_reduc_info);
7326
7327   if (nested_in_vect_loop_p (loop, stmt_info))
7328     {
7329       loop = loop->inner;
7330       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7331     }
7332
7333   gimple_match_op op;
7334   if (!gimple_extract_op (stmt_info->stmt, &op))
7335     gcc_unreachable ();
7336   gcc_assert (op.code.is_tree_code ());
7337   auto code = tree_code (op.code);
7338
7339   /* All uses but the last are expected to be defined in the loop.
7340      The last use is the reduction variable.  In case of nested cycle this
7341      assumption is not true: we use reduc_index to record the index of the
7342      reduction variable.  */
7343   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7344   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7345   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7346   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7347
7348   if (slp_node)
7349     {
7350       ncopies = 1;
7351       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7352     }
7353   else
7354     {
7355       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7356       vec_num = 1;
7357     }
7358
7359   internal_fn cond_fn = get_conditional_internal_fn (code);
7360   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7361   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7362
7363   /* Transform.  */
7364   tree new_temp = NULL_TREE;
7365   auto_vec<tree> vec_oprnds0;
7366   auto_vec<tree> vec_oprnds1;
7367   auto_vec<tree> vec_oprnds2;
7368   tree def0;
7369
7370   if (dump_enabled_p ())
7371     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7372
7373   /* FORNOW: Multiple types are not supported for condition.  */
7374   if (code == COND_EXPR)
7375     gcc_assert (ncopies == 1);
7376
7377   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7378
7379   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7380   if (reduction_type == FOLD_LEFT_REDUCTION)
7381     {
7382       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7383       return vectorize_fold_left_reduction
7384           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7385            reduc_fn, op.ops, vectype_in, reduc_index, masks);
7386     }
7387
7388   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7389   gcc_assert (single_defuse_cycle
7390               || code == DOT_PROD_EXPR
7391               || code == WIDEN_SUM_EXPR
7392               || code == SAD_EXPR);
7393
7394   /* Create the destination vector  */
7395   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7396   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7397
7398   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7399                      single_defuse_cycle && reduc_index == 0
7400                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
7401                      single_defuse_cycle && reduc_index == 1
7402                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
7403                      op.num_ops == 3
7404                      && !(single_defuse_cycle && reduc_index == 2)
7405                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7406   if (single_defuse_cycle)
7407     {
7408       gcc_assert (!slp_node);
7409       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7410                                      op.ops[reduc_index],
7411                                      reduc_index == 0 ? &vec_oprnds0
7412                                      : (reduc_index == 1 ? &vec_oprnds1
7413                                         : &vec_oprnds2));
7414     }
7415
7416   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7417     {
7418       gimple *new_stmt;
7419       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7420       if (masked_loop_p && !mask_by_cond_expr)
7421         {
7422           /* Make sure that the reduction accumulator is vop[0].  */
7423           if (reduc_index == 1)
7424             {
7425               gcc_assert (commutative_tree_code (code));
7426               std::swap (vop[0], vop[1]);
7427             }
7428           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7429                                           vectype_in, i);
7430           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7431                                                     vop[0], vop[1], vop[0]);
7432           new_temp = make_ssa_name (vec_dest, call);
7433           gimple_call_set_lhs (call, new_temp);
7434           gimple_call_set_nothrow (call, true);
7435           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7436           new_stmt = call;
7437         }
7438       else
7439         {
7440           if (op.num_ops == 3)
7441             vop[2] = vec_oprnds2[i];
7442
7443           if (masked_loop_p && mask_by_cond_expr)
7444             {
7445               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7446                                               vectype_in, i);
7447               build_vect_cond_expr (code, vop, mask, gsi);
7448             }
7449
7450           new_stmt = gimple_build_assign (vec_dest, code,
7451                                           vop[0], vop[1], vop[2]);
7452           new_temp = make_ssa_name (vec_dest, new_stmt);
7453           gimple_assign_set_lhs (new_stmt, new_temp);
7454           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7455         }
7456
7457       if (slp_node)
7458         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7459       else if (single_defuse_cycle
7460                && i < ncopies - 1)
7461         {
7462           if (reduc_index == 0)
7463             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7464           else if (reduc_index == 1)
7465             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7466           else if (reduc_index == 2)
7467             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7468         }
7469       else
7470         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7471     }
7472
7473   if (!slp_node)
7474     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7475
7476   return true;
7477 }
7478
7479 /* Transform phase of a cycle PHI.  */
7480
7481 bool
7482 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7483                           stmt_vec_info stmt_info, gimple **vec_stmt,
7484                           slp_tree slp_node, slp_instance slp_node_instance)
7485 {
7486   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7487   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7488   int i;
7489   int ncopies;
7490   int j;
7491   bool nested_cycle = false;
7492   int vec_num;
7493
7494   if (nested_in_vect_loop_p (loop, stmt_info))
7495     {
7496       loop = loop->inner;
7497       nested_cycle = true;
7498     }
7499
7500   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7501   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7502   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7503   gcc_assert (reduc_info->is_reduc_info);
7504
7505   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7506       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7507     /* Leave the scalar phi in place.  */
7508     return true;
7509
7510   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7511   /* For a nested cycle we do not fill the above.  */
7512   if (!vectype_in)
7513     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7514   gcc_assert (vectype_in);
7515
7516   if (slp_node)
7517     {
7518       /* The size vect_schedule_slp_instance computes is off for us.  */
7519       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7520                                       * SLP_TREE_LANES (slp_node), vectype_in);
7521       ncopies = 1;
7522     }
7523   else
7524     {
7525       vec_num = 1;
7526       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7527     }
7528
7529   /* Check whether we should use a single PHI node and accumulate
7530      vectors to one before the backedge.  */
7531   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7532     ncopies = 1;
7533
7534   /* Create the destination vector  */
7535   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7536   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7537                                                vectype_out);
7538
7539   /* Get the loop-entry arguments.  */
7540   tree vec_initial_def = NULL_TREE;
7541   auto_vec<tree> vec_initial_defs;
7542   if (slp_node)
7543     {
7544       vec_initial_defs.reserve (vec_num);
7545       if (nested_cycle)
7546         {
7547           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7548           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7549                              &vec_initial_defs);
7550         }
7551       else
7552         {
7553           gcc_assert (slp_node == slp_node_instance->reduc_phis);
7554           vec<tree> &initial_values = reduc_info->reduc_initial_values;
7555           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7556
7557           unsigned int num_phis = stmts.length ();
7558           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7559             num_phis = 1;
7560           initial_values.reserve (num_phis);
7561           for (unsigned int i = 0; i < num_phis; ++i)
7562             {
7563               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7564               initial_values.quick_push (vect_phi_initial_value (this_phi));
7565             }
7566           if (vec_num == 1)
7567             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7568           if (!initial_values.is_empty ())
7569             {
7570               tree initial_value
7571                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7572               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7573               tree neutral_op
7574                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7575                                             code, initial_value);
7576               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7577                                               &vec_initial_defs, vec_num,
7578                                               stmts.length (), neutral_op);
7579             }
7580         }
7581     }
7582   else
7583     {
7584       /* Get at the scalar def before the loop, that defines the initial
7585          value of the reduction variable.  */
7586       tree initial_def = vect_phi_initial_value (phi);
7587       reduc_info->reduc_initial_values.safe_push (initial_def);
7588       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7589          and we can't use zero for induc_val, use initial_def.  Similarly
7590          for REDUC_MIN and initial_def larger than the base.  */
7591       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7592         {
7593           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7594           if (TREE_CODE (initial_def) == INTEGER_CST
7595               && !integer_zerop (induc_val)
7596               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7597                    && tree_int_cst_lt (initial_def, induc_val))
7598                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7599                       && tree_int_cst_lt (induc_val, initial_def))))
7600             {
7601               induc_val = initial_def;
7602               /* Communicate we used the initial_def to epilouge
7603                  generation.  */
7604               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7605             }
7606           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7607         }
7608       else if (nested_cycle)
7609         {
7610           /* Do not use an adjustment def as that case is not supported
7611              correctly if ncopies is not one.  */
7612           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7613                                          ncopies, initial_def,
7614                                          &vec_initial_defs);
7615         }
7616       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7617                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7618         /* Fill the initial vector with the initial scalar value.  */
7619         vec_initial_def
7620           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7621                                            initial_def, initial_def);
7622       else
7623         {
7624           if (ncopies == 1)
7625             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7626           if (!reduc_info->reduc_initial_values.is_empty ())
7627             {
7628               initial_def = reduc_info->reduc_initial_values[0];
7629               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7630               tree neutral_op
7631                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7632                                             code, initial_def);
7633               gcc_assert (neutral_op);
7634               /* Try to simplify the vector initialization by applying an
7635                  adjustment after the reduction has been performed.  */
7636               if (!reduc_info->reused_accumulator
7637                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7638                   && !operand_equal_p (neutral_op, initial_def))
7639                 {
7640                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7641                     = initial_def;
7642                   initial_def = neutral_op;
7643                 }
7644               vec_initial_def
7645                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7646                                                  initial_def, neutral_op);
7647             }
7648         }
7649     }
7650
7651   if (vec_initial_def)
7652     {
7653       vec_initial_defs.create (ncopies);
7654       for (i = 0; i < ncopies; ++i)
7655         vec_initial_defs.quick_push (vec_initial_def);
7656     }
7657
7658   if (auto *accumulator = reduc_info->reused_accumulator)
7659     {
7660       tree def = accumulator->reduc_input;
7661       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7662         {
7663           unsigned int nreduc;
7664           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7665                                             (TREE_TYPE (def)),
7666                                           TYPE_VECTOR_SUBPARTS (vectype_out),
7667                                           &nreduc);
7668           gcc_assert (res);
7669           gimple_seq stmts = NULL;
7670           /* Reduce the single vector to a smaller one.  */
7671           if (nreduc != 1)
7672             {
7673               /* Perform the reduction in the appropriate type.  */
7674               tree rvectype = vectype_out;
7675               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7676                                               TREE_TYPE (TREE_TYPE (def))))
7677                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7678                                               TYPE_VECTOR_SUBPARTS
7679                                                 (vectype_out));
7680               def = vect_create_partial_epilog (def, rvectype,
7681                                                 STMT_VINFO_REDUC_CODE
7682                                                   (reduc_info),
7683                                                 &stmts);
7684             }
7685           /* The epilogue loop might use a different vector mode, like
7686              VNx2DI vs. V2DI.  */
7687           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7688             {
7689               tree reduc_type = build_vector_type_for_mode
7690                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7691               def = gimple_convert (&stmts, reduc_type, def);
7692             }
7693           /* Adjust the input so we pick up the partially reduced value
7694              for the skip edge in vect_create_epilog_for_reduction.  */
7695           accumulator->reduc_input = def;
7696           /* And the reduction could be carried out using a different sign.  */
7697           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7698             def = gimple_convert (&stmts, vectype_out, def);
7699           if (loop_vinfo->main_loop_edge)
7700             {
7701               /* While we'd like to insert on the edge this will split
7702                  blocks and disturb bookkeeping, we also will eventually
7703                  need this on the skip edge.  Rely on sinking to
7704                  fixup optimal placement and insert in the pred.  */
7705               gimple_stmt_iterator gsi
7706                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7707               /* Insert before a cond that eventually skips the
7708                  epilogue.  */
7709               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7710                 gsi_prev (&gsi);
7711               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7712             }
7713           else
7714             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7715                                               stmts);
7716         }
7717       if (loop_vinfo->main_loop_edge)
7718         vec_initial_defs[0]
7719           = vect_get_main_loop_result (loop_vinfo, def,
7720                                        vec_initial_defs[0]);
7721       else
7722         vec_initial_defs.safe_push (def);
7723     }
7724
7725   /* Generate the reduction PHIs upfront.  */
7726   for (i = 0; i < vec_num; i++)
7727     {
7728       tree vec_init_def = vec_initial_defs[i];
7729       for (j = 0; j < ncopies; j++)
7730         {
7731           /* Create the reduction-phi that defines the reduction
7732              operand.  */
7733           gphi *new_phi = create_phi_node (vec_dest, loop->header);
7734
7735           /* Set the loop-entry arg of the reduction-phi.  */
7736           if (j != 0 && nested_cycle)
7737             vec_init_def = vec_initial_defs[j];
7738           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7739                        UNKNOWN_LOCATION);
7740
7741           /* The loop-latch arg is set in epilogue processing.  */
7742
7743           if (slp_node)
7744             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7745           else
7746             {
7747               if (j == 0)
7748                 *vec_stmt = new_phi;
7749               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7750             }
7751         }
7752     }
7753
7754   return true;
7755 }
7756
7757 /* Vectorizes LC PHIs.  */
7758
7759 bool
7760 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7761                      stmt_vec_info stmt_info, gimple **vec_stmt,
7762                      slp_tree slp_node)
7763 {
7764   if (!loop_vinfo
7765       || !is_a <gphi *> (stmt_info->stmt)
7766       || gimple_phi_num_args (stmt_info->stmt) != 1)
7767     return false;
7768
7769   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7770       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7771     return false;
7772
7773   if (!vec_stmt) /* transformation not required.  */
7774     {
7775       /* Deal with copies from externs or constants that disguise as
7776          loop-closed PHI nodes (PR97886).  */
7777       if (slp_node
7778           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7779                                                 SLP_TREE_VECTYPE (slp_node)))
7780         {
7781           if (dump_enabled_p ())
7782             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7783                              "incompatible vector types for invariants\n");
7784           return false;
7785         }
7786       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7787       return true;
7788     }
7789
7790   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7791   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7792   basic_block bb = gimple_bb (stmt_info->stmt);
7793   edge e = single_pred_edge (bb);
7794   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7795   auto_vec<tree> vec_oprnds;
7796   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7797                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7798                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7799   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7800     {
7801       /* Create the vectorized LC PHI node.  */
7802       gphi *new_phi = create_phi_node (vec_dest, bb);
7803       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7804       if (slp_node)
7805         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7806       else
7807         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7808     }
7809   if (!slp_node)
7810     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7811
7812   return true;
7813 }
7814
7815 /* Vectorizes PHIs.  */
7816
7817 bool
7818 vectorizable_phi (vec_info *,
7819                   stmt_vec_info stmt_info, gimple **vec_stmt,
7820                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7821 {
7822   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7823     return false;
7824
7825   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7826     return false;
7827
7828   tree vectype = SLP_TREE_VECTYPE (slp_node);
7829
7830   if (!vec_stmt) /* transformation not required.  */
7831     {
7832       slp_tree child;
7833       unsigned i;
7834       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7835         if (!child)
7836           {
7837             if (dump_enabled_p ())
7838               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7839                                "PHI node with unvectorized backedge def\n");
7840             return false;
7841           }
7842         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7843           {
7844             if (dump_enabled_p ())
7845               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7846                                "incompatible vector types for invariants\n");
7847             return false;
7848           }
7849         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7850                  && !useless_type_conversion_p (vectype,
7851                                                 SLP_TREE_VECTYPE (child)))
7852           {
7853             /* With bools we can have mask and non-mask precision vectors,
7854                while pattern recog is supposed to guarantee consistency here
7855                bugs in it can cause mismatches (PR103489 for example).
7856                Deal with them here instead of ICEing later.  */
7857             if (dump_enabled_p ())
7858               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7859                                "incompatible vector type setup from "
7860                                "bool pattern detection\n");
7861             gcc_checking_assert
7862               (VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (child))
7863                != VECTOR_BOOLEAN_TYPE_P (vectype));
7864             return false;
7865           }
7866
7867       /* For single-argument PHIs assume coalescing which means zero cost
7868          for the scalar and the vector PHIs.  This avoids artificially
7869          favoring the vector path (but may pessimize it in some cases).  */
7870       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7871         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7872                           vector_stmt, stmt_info, vectype, 0, vect_body);
7873       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7874       return true;
7875     }
7876
7877   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7878   basic_block bb = gimple_bb (stmt_info->stmt);
7879   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7880   auto_vec<gphi *> new_phis;
7881   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7882     {
7883       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7884
7885       /* Skip not yet vectorized defs.  */
7886       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7887           && SLP_TREE_VEC_STMTS (child).is_empty ())
7888         continue;
7889
7890       auto_vec<tree> vec_oprnds;
7891       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7892       if (!new_phis.exists ())
7893         {
7894           new_phis.create (vec_oprnds.length ());
7895           for (unsigned j = 0; j < vec_oprnds.length (); j++)
7896             {
7897               /* Create the vectorized LC PHI node.  */
7898               new_phis.quick_push (create_phi_node (vec_dest, bb));
7899               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7900             }
7901         }
7902       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7903       for (unsigned j = 0; j < vec_oprnds.length (); j++)
7904         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7905     }
7906   /* We should have at least one already vectorized child.  */
7907   gcc_assert (new_phis.exists ());
7908
7909   return true;
7910 }
7911
7912 /* Return true if VECTYPE represents a vector that requires lowering
7913    by the vector lowering pass.  */
7914
7915 bool
7916 vect_emulated_vector_p (tree vectype)
7917 {
7918   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7919           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7920               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7921 }
7922
7923 /* Return true if we can emulate CODE on an integer mode representation
7924    of a vector.  */
7925
7926 bool
7927 vect_can_vectorize_without_simd_p (tree_code code)
7928 {
7929   switch (code)
7930     {
7931     case PLUS_EXPR:
7932     case MINUS_EXPR:
7933     case NEGATE_EXPR:
7934     case BIT_AND_EXPR:
7935     case BIT_IOR_EXPR:
7936     case BIT_XOR_EXPR:
7937     case BIT_NOT_EXPR:
7938       return true;
7939
7940     default:
7941       return false;
7942     }
7943 }
7944
7945 /* Likewise, but taking a code_helper.  */
7946
7947 bool
7948 vect_can_vectorize_without_simd_p (code_helper code)
7949 {
7950   return (code.is_tree_code ()
7951           && vect_can_vectorize_without_simd_p (tree_code (code)));
7952 }
7953
7954 /* Function vectorizable_induction
7955
7956    Check if STMT_INFO performs an induction computation that can be vectorized.
7957    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7958    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7959    Return true if STMT_INFO is vectorizable in this way.  */
7960
7961 bool
7962 vectorizable_induction (loop_vec_info loop_vinfo,
7963                         stmt_vec_info stmt_info,
7964                         gimple **vec_stmt, slp_tree slp_node,
7965                         stmt_vector_for_cost *cost_vec)
7966 {
7967   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7968   unsigned ncopies;
7969   bool nested_in_vect_loop = false;
7970   class loop *iv_loop;
7971   tree vec_def;
7972   edge pe = loop_preheader_edge (loop);
7973   basic_block new_bb;
7974   tree new_vec, vec_init, vec_step, t;
7975   tree new_name;
7976   gimple *new_stmt;
7977   gphi *induction_phi;
7978   tree induc_def, vec_dest;
7979   tree init_expr, step_expr;
7980   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7981   unsigned i;
7982   tree expr;
7983   gimple_stmt_iterator si;
7984
7985   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7986   if (!phi)
7987     return false;
7988
7989   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7990     return false;
7991
7992   /* Make sure it was recognized as induction computation.  */
7993   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7994     return false;
7995
7996   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7997   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7998
7999   if (slp_node)
8000     ncopies = 1;
8001   else
8002     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8003   gcc_assert (ncopies >= 1);
8004
8005   /* FORNOW. These restrictions should be relaxed.  */
8006   if (nested_in_vect_loop_p (loop, stmt_info))
8007     {
8008       imm_use_iterator imm_iter;
8009       use_operand_p use_p;
8010       gimple *exit_phi;
8011       edge latch_e;
8012       tree loop_arg;
8013
8014       if (ncopies > 1)
8015         {
8016           if (dump_enabled_p ())
8017             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8018                              "multiple types in nested loop.\n");
8019           return false;
8020         }
8021
8022       exit_phi = NULL;
8023       latch_e = loop_latch_edge (loop->inner);
8024       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8025       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8026         {
8027           gimple *use_stmt = USE_STMT (use_p);
8028           if (is_gimple_debug (use_stmt))
8029             continue;
8030
8031           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8032             {
8033               exit_phi = use_stmt;
8034               break;
8035             }
8036         }
8037       if (exit_phi)
8038         {
8039           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8040           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8041                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8042             {
8043               if (dump_enabled_p ())
8044                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8045                                  "inner-loop induction only used outside "
8046                                  "of the outer vectorized loop.\n");
8047               return false;
8048             }
8049         }
8050
8051       nested_in_vect_loop = true;
8052       iv_loop = loop->inner;
8053     }
8054   else
8055     iv_loop = loop;
8056   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8057
8058   if (slp_node && !nunits.is_constant ())
8059     {
8060       /* The current SLP code creates the step value element-by-element.  */
8061       if (dump_enabled_p ())
8062         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8063                          "SLP induction not supported for variable-length"
8064                          " vectors.\n");
8065       return false;
8066     }
8067
8068   if (!vec_stmt) /* transformation not required.  */
8069     {
8070       unsigned inside_cost = 0, prologue_cost = 0;
8071       if (slp_node)
8072         {
8073           /* We eventually need to set a vector type on invariant
8074              arguments.  */
8075           unsigned j;
8076           slp_tree child;
8077           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8078             if (!vect_maybe_update_slp_op_vectype
8079                 (child, SLP_TREE_VECTYPE (slp_node)))
8080               {
8081                 if (dump_enabled_p ())
8082                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8083                                    "incompatible vector types for "
8084                                    "invariants\n");
8085                 return false;
8086               }
8087           /* loop cost for vec_loop.  */
8088           inside_cost
8089             = record_stmt_cost (cost_vec,
8090                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8091                                 vector_stmt, stmt_info, 0, vect_body);
8092           /* prologue cost for vec_init (if not nested) and step.  */
8093           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8094                                             scalar_to_vec,
8095                                             stmt_info, 0, vect_prologue);
8096         }
8097       else /* if (!slp_node) */
8098         {
8099           /* loop cost for vec_loop.  */
8100           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8101                                           stmt_info, 0, vect_body);
8102           /* prologue cost for vec_init and vec_step.  */
8103           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8104                                             stmt_info, 0, vect_prologue);
8105         }
8106       if (dump_enabled_p ())
8107         dump_printf_loc (MSG_NOTE, vect_location,
8108                          "vect_model_induction_cost: inside_cost = %d, "
8109                          "prologue_cost = %d .\n", inside_cost,
8110                          prologue_cost);
8111
8112       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8113       DUMP_VECT_SCOPE ("vectorizable_induction");
8114       return true;
8115     }
8116
8117   /* Transform.  */
8118
8119   /* Compute a vector variable, initialized with the first VF values of
8120      the induction variable.  E.g., for an iv with IV_PHI='X' and
8121      evolution S, for a vector of 4 units, we want to compute:
8122      [X, X + S, X + 2*S, X + 3*S].  */
8123
8124   if (dump_enabled_p ())
8125     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8126
8127   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8128   gcc_assert (step_expr != NULL_TREE);
8129   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8130
8131   pe = loop_preheader_edge (iv_loop);
8132   /* Find the first insertion point in the BB.  */
8133   basic_block bb = gimple_bb (phi);
8134   si = gsi_after_labels (bb);
8135
8136   /* For SLP induction we have to generate several IVs as for example
8137      with group size 3 we need
8138        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8139        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8140   if (slp_node)
8141     {
8142       /* Enforced above.  */
8143       unsigned int const_nunits = nunits.to_constant ();
8144
8145       /* The initial values are vectorized, but any lanes > group_size
8146          need adjustment.  */
8147       slp_tree init_node
8148         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8149
8150       /* Gather steps.  Since we do not vectorize inductions as
8151          cycles we have to reconstruct the step from SCEV data.  */
8152       unsigned group_size = SLP_TREE_LANES (slp_node);
8153       tree *steps = XALLOCAVEC (tree, group_size);
8154       tree *inits = XALLOCAVEC (tree, group_size);
8155       stmt_vec_info phi_info;
8156       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8157         {
8158           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8159           if (!init_node)
8160             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8161                                            pe->dest_idx);
8162         }
8163
8164       /* Now generate the IVs.  */
8165       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8166       gcc_assert ((const_nunits * nvects) % group_size == 0);
8167       unsigned nivs;
8168       if (nested_in_vect_loop)
8169         nivs = nvects;
8170       else
8171         {
8172           /* Compute the number of distinct IVs we need.  First reduce
8173              group_size if it is a multiple of const_nunits so we get
8174              one IV for a group_size of 4 but const_nunits 2.  */
8175           unsigned group_sizep = group_size;
8176           if (group_sizep % const_nunits == 0)
8177             group_sizep = group_sizep / const_nunits;
8178           nivs = least_common_multiple (group_sizep,
8179                                         const_nunits) / const_nunits;
8180         }
8181       tree stept = TREE_TYPE (step_vectype);
8182       tree lupdate_mul = NULL_TREE;
8183       if (!nested_in_vect_loop)
8184         {
8185           /* The number of iterations covered in one vector iteration.  */
8186           unsigned lup_mul = (nvects * const_nunits) / group_size;
8187           lupdate_mul
8188             = build_vector_from_val (step_vectype,
8189                                      SCALAR_FLOAT_TYPE_P (stept)
8190                                      ? build_real_from_wide (stept, lup_mul,
8191                                                              UNSIGNED)
8192                                      : build_int_cstu (stept, lup_mul));
8193         }
8194       tree peel_mul = NULL_TREE;
8195       gimple_seq init_stmts = NULL;
8196       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8197         {
8198           if (SCALAR_FLOAT_TYPE_P (stept))
8199             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8200                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8201           else
8202             peel_mul = gimple_convert (&init_stmts, stept,
8203                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8204           peel_mul = gimple_build_vector_from_val (&init_stmts,
8205                                                    step_vectype, peel_mul);
8206         }
8207       unsigned ivn;
8208       auto_vec<tree> vec_steps;
8209       for (ivn = 0; ivn < nivs; ++ivn)
8210         {
8211           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8212           tree_vector_builder init_elts (vectype, const_nunits, 1);
8213           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8214           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8215             {
8216               /* The scalar steps of the IVs.  */
8217               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8218               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8219               step_elts.quick_push (elt);
8220               if (!init_node)
8221                 {
8222                   /* The scalar inits of the IVs if not vectorized.  */
8223                   elt = inits[(ivn*const_nunits + eltn) % group_size];
8224                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
8225                                                   TREE_TYPE (elt)))
8226                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8227                                         TREE_TYPE (vectype), elt);
8228                   init_elts.quick_push (elt);
8229                 }
8230               /* The number of steps to add to the initial values.  */
8231               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8232               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8233                                    ? build_real_from_wide (stept,
8234                                                            mul_elt, UNSIGNED)
8235                                    : build_int_cstu (stept, mul_elt));
8236             }
8237           vec_step = gimple_build_vector (&init_stmts, &step_elts);
8238           vec_steps.safe_push (vec_step);
8239           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8240           if (peel_mul)
8241             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8242                                      step_mul, peel_mul);
8243           if (!init_node)
8244             vec_init = gimple_build_vector (&init_stmts, &init_elts);
8245
8246           /* Create the induction-phi that defines the induction-operand.  */
8247           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8248                                             "vec_iv_");
8249           induction_phi = create_phi_node (vec_dest, iv_loop->header);
8250           induc_def = PHI_RESULT (induction_phi);
8251
8252           /* Create the iv update inside the loop  */
8253           tree up = vec_step;
8254           if (lupdate_mul)
8255             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8256                                vec_step, lupdate_mul);
8257           gimple_seq stmts = NULL;
8258           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8259           vec_def = gimple_build (&stmts,
8260                                   PLUS_EXPR, step_vectype, vec_def, up);
8261           vec_def = gimple_convert (&stmts, vectype, vec_def);
8262           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8263           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8264                        UNKNOWN_LOCATION);
8265
8266           if (init_node)
8267             vec_init = vect_get_slp_vect_def (init_node, ivn);
8268           if (!nested_in_vect_loop
8269               && !integer_zerop (step_mul))
8270             {
8271               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8272               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8273                                  vec_step, step_mul);
8274               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8275                                       vec_def, up);
8276               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8277             }
8278
8279           /* Set the arguments of the phi node:  */
8280           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8281
8282           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8283         }
8284       if (!nested_in_vect_loop)
8285         {
8286           /* Fill up to the number of vectors we need for the whole group.  */
8287           nivs = least_common_multiple (group_size,
8288                                         const_nunits) / const_nunits;
8289           vec_steps.reserve (nivs-ivn);
8290           for (; ivn < nivs; ++ivn)
8291             {
8292               SLP_TREE_VEC_STMTS (slp_node)
8293                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8294               vec_steps.quick_push (vec_steps[0]);
8295             }
8296         }
8297
8298       /* Re-use IVs when we can.  We are generating further vector
8299          stmts by adding VF' * stride to the IVs generated above.  */
8300       if (ivn < nvects)
8301         {
8302           unsigned vfp
8303             = least_common_multiple (group_size, const_nunits) / group_size;
8304           tree lupdate_mul
8305             = build_vector_from_val (step_vectype,
8306                                      SCALAR_FLOAT_TYPE_P (stept)
8307                                      ? build_real_from_wide (stept,
8308                                                              vfp, UNSIGNED)
8309                                      : build_int_cstu (stept, vfp));
8310           for (; ivn < nvects; ++ivn)
8311             {
8312               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8313               tree def = gimple_get_lhs (iv);
8314               if (ivn < 2*nivs)
8315                 vec_steps[ivn - nivs]
8316                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8317                                   vec_steps[ivn - nivs], lupdate_mul);
8318               gimple_seq stmts = NULL;
8319               def = gimple_convert (&stmts, step_vectype, def);
8320               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8321                                   def, vec_steps[ivn % nivs]);
8322               def = gimple_convert (&stmts, vectype, def);
8323               if (gimple_code (iv) == GIMPLE_PHI)
8324                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8325               else
8326                 {
8327                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8328                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8329                 }
8330               SLP_TREE_VEC_STMTS (slp_node)
8331                 .quick_push (SSA_NAME_DEF_STMT (def));
8332             }
8333         }
8334
8335       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8336       gcc_assert (!new_bb);
8337
8338       return true;
8339     }
8340
8341   init_expr = vect_phi_initial_value (phi);
8342
8343   gimple_seq stmts = NULL;
8344   if (!nested_in_vect_loop)
8345     {
8346       /* Convert the initial value to the IV update type.  */
8347       tree new_type = TREE_TYPE (step_expr);
8348       init_expr = gimple_convert (&stmts, new_type, init_expr);
8349
8350       /* If we are using the loop mask to "peel" for alignment then we need
8351          to adjust the start value here.  */
8352       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8353       if (skip_niters != NULL_TREE)
8354         {
8355           if (FLOAT_TYPE_P (vectype))
8356             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8357                                         skip_niters);
8358           else
8359             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8360           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8361                                          skip_niters, step_expr);
8362           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8363                                     init_expr, skip_step);
8364         }
8365     }
8366
8367   if (stmts)
8368     {
8369       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8370       gcc_assert (!new_bb);
8371     }
8372
8373   /* Create the vector that holds the initial_value of the induction.  */
8374   if (nested_in_vect_loop)
8375     {
8376       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8377          been created during vectorization of previous stmts.  We obtain it
8378          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8379       auto_vec<tree> vec_inits;
8380       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8381                                      init_expr, &vec_inits);
8382       vec_init = vec_inits[0];
8383       /* If the initial value is not of proper type, convert it.  */
8384       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8385         {
8386           new_stmt
8387             = gimple_build_assign (vect_get_new_ssa_name (vectype,
8388                                                           vect_simple_var,
8389                                                           "vec_iv_"),
8390                                    VIEW_CONVERT_EXPR,
8391                                    build1 (VIEW_CONVERT_EXPR, vectype,
8392                                            vec_init));
8393           vec_init = gimple_assign_lhs (new_stmt);
8394           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8395                                                  new_stmt);
8396           gcc_assert (!new_bb);
8397         }
8398     }
8399   else
8400     {
8401       /* iv_loop is the loop to be vectorized. Create:
8402          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8403       stmts = NULL;
8404       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8405
8406       unsigned HOST_WIDE_INT const_nunits;
8407       if (nunits.is_constant (&const_nunits))
8408         {
8409           tree_vector_builder elts (step_vectype, const_nunits, 1);
8410           elts.quick_push (new_name);
8411           for (i = 1; i < const_nunits; i++)
8412             {
8413               /* Create: new_name_i = new_name + step_expr  */
8414               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8415                                        new_name, step_expr);
8416               elts.quick_push (new_name);
8417             }
8418           /* Create a vector from [new_name_0, new_name_1, ...,
8419              new_name_nunits-1]  */
8420           vec_init = gimple_build_vector (&stmts, &elts);
8421         }
8422       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8423         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
8424         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8425                                  new_name, step_expr);
8426       else
8427         {
8428           /* Build:
8429                 [base, base, base, ...]
8430                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8431           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8432           gcc_assert (flag_associative_math);
8433           tree index = build_index_vector (step_vectype, 0, 1);
8434           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8435                                                         new_name);
8436           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8437                                                         step_expr);
8438           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8439           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8440                                    vec_init, step_vec);
8441           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8442                                    vec_init, base_vec);
8443         }
8444       vec_init = gimple_convert (&stmts, vectype, vec_init);
8445
8446       if (stmts)
8447         {
8448           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8449           gcc_assert (!new_bb);
8450         }
8451     }
8452
8453
8454   /* Create the vector that holds the step of the induction.  */
8455   if (nested_in_vect_loop)
8456     /* iv_loop is nested in the loop to be vectorized. Generate:
8457        vec_step = [S, S, S, S]  */
8458     new_name = step_expr;
8459   else
8460     {
8461       /* iv_loop is the loop to be vectorized. Generate:
8462           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8463       gimple_seq seq = NULL;
8464       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8465         {
8466           expr = build_int_cst (integer_type_node, vf);
8467           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8468         }
8469       else
8470         expr = build_int_cst (TREE_TYPE (step_expr), vf);
8471       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8472                                expr, step_expr);
8473       if (seq)
8474         {
8475           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8476           gcc_assert (!new_bb);
8477         }
8478     }
8479
8480   t = unshare_expr (new_name);
8481   gcc_assert (CONSTANT_CLASS_P (new_name)
8482               || TREE_CODE (new_name) == SSA_NAME);
8483   new_vec = build_vector_from_val (step_vectype, t);
8484   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8485                                new_vec, step_vectype, NULL);
8486
8487
8488   /* Create the following def-use cycle:
8489      loop prolog:
8490          vec_init = ...
8491          vec_step = ...
8492      loop:
8493          vec_iv = PHI <vec_init, vec_loop>
8494          ...
8495          STMT
8496          ...
8497          vec_loop = vec_iv + vec_step;  */
8498
8499   /* Create the induction-phi that defines the induction-operand.  */
8500   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8501   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8502   induc_def = PHI_RESULT (induction_phi);
8503
8504   /* Create the iv update inside the loop  */
8505   stmts = NULL;
8506   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8507   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8508   vec_def = gimple_convert (&stmts, vectype, vec_def);
8509   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8510   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8511
8512   /* Set the arguments of the phi node:  */
8513   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8514   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8515                UNKNOWN_LOCATION);
8516
8517   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8518   *vec_stmt = induction_phi;
8519
8520   /* In case that vectorization factor (VF) is bigger than the number
8521      of elements that we can fit in a vectype (nunits), we have to generate
8522      more than one vector stmt - i.e - we need to "unroll" the
8523      vector stmt by a factor VF/nunits.  For more details see documentation
8524      in vectorizable_operation.  */
8525
8526   if (ncopies > 1)
8527     {
8528       gimple_seq seq = NULL;
8529       /* FORNOW. This restriction should be relaxed.  */
8530       gcc_assert (!nested_in_vect_loop);
8531
8532       /* Create the vector that holds the step of the induction.  */
8533       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8534         {
8535           expr = build_int_cst (integer_type_node, nunits);
8536           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8537         }
8538       else
8539         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8540       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8541                                expr, step_expr);
8542       if (seq)
8543         {
8544           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8545           gcc_assert (!new_bb);
8546         }
8547
8548       t = unshare_expr (new_name);
8549       gcc_assert (CONSTANT_CLASS_P (new_name)
8550                   || TREE_CODE (new_name) == SSA_NAME);
8551       new_vec = build_vector_from_val (step_vectype, t);
8552       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8553                                    new_vec, step_vectype, NULL);
8554
8555       vec_def = induc_def;
8556       for (i = 1; i < ncopies; i++)
8557         {
8558           /* vec_i = vec_prev + vec_step  */
8559           gimple_seq stmts = NULL;
8560           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8561           vec_def = gimple_build (&stmts,
8562                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
8563           vec_def = gimple_convert (&stmts, vectype, vec_def);
8564
8565           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8566           new_stmt = SSA_NAME_DEF_STMT (vec_def);
8567           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8568         }
8569     }
8570
8571   if (dump_enabled_p ())
8572     dump_printf_loc (MSG_NOTE, vect_location,
8573                      "transform induction: created def-use cycle: %G%G",
8574                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
8575
8576   return true;
8577 }
8578
8579 /* Function vectorizable_live_operation.
8580
8581    STMT_INFO computes a value that is used outside the loop.  Check if
8582    it can be supported.  */
8583
8584 bool
8585 vectorizable_live_operation (vec_info *vinfo,
8586                              stmt_vec_info stmt_info,
8587                              gimple_stmt_iterator *gsi,
8588                              slp_tree slp_node, slp_instance slp_node_instance,
8589                              int slp_index, bool vec_stmt_p,
8590                              stmt_vector_for_cost *cost_vec)
8591 {
8592   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8593   imm_use_iterator imm_iter;
8594   tree lhs, lhs_type, bitsize;
8595   tree vectype = (slp_node
8596                   ? SLP_TREE_VECTYPE (slp_node)
8597                   : STMT_VINFO_VECTYPE (stmt_info));
8598   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8599   int ncopies;
8600   gimple *use_stmt;
8601   auto_vec<tree> vec_oprnds;
8602   int vec_entry = 0;
8603   poly_uint64 vec_index = 0;
8604
8605   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8606
8607   /* If a stmt of a reduction is live, vectorize it via
8608      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8609      validity so just trigger the transform here.  */
8610   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8611     {
8612       if (!vec_stmt_p)
8613         return true;
8614       if (slp_node)
8615         {
8616           /* For reduction chains the meta-info is attached to
8617              the group leader.  */
8618           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8619             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8620           /* For SLP reductions we vectorize the epilogue for
8621              all involved stmts together.  */
8622           else if (slp_index != 0)
8623             return true;
8624           else
8625             /* For SLP reductions the meta-info is attached to
8626                the representative.  */
8627             stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8628         }
8629       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8630       gcc_assert (reduc_info->is_reduc_info);
8631       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8632           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8633         return true;
8634       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8635                                         slp_node_instance);
8636       return true;
8637     }
8638
8639   /* If STMT is not relevant and it is a simple assignment and its inputs are
8640      invariant then it can remain in place, unvectorized.  The original last
8641      scalar value that it computes will be used.  */
8642   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8643     {
8644       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8645       if (dump_enabled_p ())
8646         dump_printf_loc (MSG_NOTE, vect_location,
8647                          "statement is simple and uses invariant.  Leaving in "
8648                          "place.\n");
8649       return true;
8650     }
8651
8652   if (slp_node)
8653     ncopies = 1;
8654   else
8655     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8656
8657   if (slp_node)
8658     {
8659       gcc_assert (slp_index >= 0);
8660
8661       /* Get the last occurrence of the scalar index from the concatenation of
8662          all the slp vectors. Calculate which slp vector it is and the index
8663          within.  */
8664       int num_scalar = SLP_TREE_LANES (slp_node);
8665       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8666       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8667
8668       /* Calculate which vector contains the result, and which lane of
8669          that vector we need.  */
8670       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8671         {
8672           if (dump_enabled_p ())
8673             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8674                              "Cannot determine which vector holds the"
8675                              " final result.\n");
8676           return false;
8677         }
8678     }
8679
8680   if (!vec_stmt_p)
8681     {
8682       /* No transformation required.  */
8683       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8684         {
8685           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8686                                                OPTIMIZE_FOR_SPEED))
8687             {
8688               if (dump_enabled_p ())
8689                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8690                                  "can't operate on partial vectors "
8691                                  "because the target doesn't support extract "
8692                                  "last reduction.\n");
8693               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8694             }
8695           else if (slp_node)
8696             {
8697               if (dump_enabled_p ())
8698                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8699                                  "can't operate on partial vectors "
8700                                  "because an SLP statement is live after "
8701                                  "the loop.\n");
8702               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8703             }
8704           else if (ncopies > 1)
8705             {
8706               if (dump_enabled_p ())
8707                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8708                                  "can't operate on partial vectors "
8709                                  "because ncopies is greater than 1.\n");
8710               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8711             }
8712           else
8713             {
8714               gcc_assert (ncopies == 1 && !slp_node);
8715               vect_record_loop_mask (loop_vinfo,
8716                                      &LOOP_VINFO_MASKS (loop_vinfo),
8717                                      1, vectype, NULL);
8718             }
8719         }
8720       /* ???  Enable for loop costing as well.  */
8721       if (!loop_vinfo)
8722         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8723                           0, vect_epilogue);
8724       return true;
8725     }
8726
8727   /* Use the lhs of the original scalar statement.  */
8728   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8729   if (dump_enabled_p ())
8730     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8731                      "stmt %G", stmt);
8732
8733   lhs = gimple_get_lhs (stmt);
8734   lhs_type = TREE_TYPE (lhs);
8735
8736   bitsize = vector_element_bits_tree (vectype);
8737
8738   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8739   tree vec_lhs, bitstart;
8740   gimple *vec_stmt;
8741   if (slp_node)
8742     {
8743       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8744
8745       /* Get the correct slp vectorized stmt.  */
8746       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8747       vec_lhs = gimple_get_lhs (vec_stmt);
8748
8749       /* Get entry to use.  */
8750       bitstart = bitsize_int (vec_index);
8751       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8752     }
8753   else
8754     {
8755       /* For multiple copies, get the last copy.  */
8756       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8757       vec_lhs = gimple_get_lhs (vec_stmt);
8758
8759       /* Get the last lane in the vector.  */
8760       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8761     }
8762
8763   if (loop_vinfo)
8764     {
8765       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8766          requirement, insert one phi node for it.  It looks like:
8767            loop;
8768          BB:
8769            # lhs' = PHI <lhs>
8770          ==>
8771            loop;
8772          BB:
8773            # vec_lhs' = PHI <vec_lhs>
8774            new_tree = lane_extract <vec_lhs', ...>;
8775            lhs' = new_tree;  */
8776
8777       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8778       basic_block exit_bb = single_exit (loop)->dest;
8779       gcc_assert (single_pred_p (exit_bb));
8780
8781       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8782       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8783       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8784
8785       gimple_seq stmts = NULL;
8786       tree new_tree;
8787       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8788         {
8789           /* Emit:
8790
8791                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8792
8793              where VEC_LHS is the vectorized live-out result and MASK is
8794              the loop mask for the final iteration.  */
8795           gcc_assert (ncopies == 1 && !slp_node);
8796           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8797           tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8798                                           1, vectype, 0);
8799           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8800                                           mask, vec_lhs_phi);
8801
8802           /* Convert the extracted vector element to the scalar type.  */
8803           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8804         }
8805       else
8806         {
8807           tree bftype = TREE_TYPE (vectype);
8808           if (VECTOR_BOOLEAN_TYPE_P (vectype))
8809             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8810           new_tree = build3 (BIT_FIELD_REF, bftype,
8811                              vec_lhs_phi, bitsize, bitstart);
8812           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8813                                            &stmts, true, NULL_TREE);
8814         }
8815
8816       if (stmts)
8817         {
8818           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8819           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8820
8821           /* Remove existing phi from lhs and create one copy from new_tree.  */
8822           tree lhs_phi = NULL_TREE;
8823           gimple_stmt_iterator gsi;
8824           for (gsi = gsi_start_phis (exit_bb);
8825                !gsi_end_p (gsi); gsi_next (&gsi))
8826             {
8827               gimple *phi = gsi_stmt (gsi);
8828               if ((gimple_phi_arg_def (phi, 0) == lhs))
8829                 {
8830                   remove_phi_node (&gsi, false);
8831                   lhs_phi = gimple_phi_result (phi);
8832                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8833                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8834                   break;
8835                 }
8836             }
8837         }
8838
8839       /* Replace use of lhs with newly computed result.  If the use stmt is a
8840          single arg PHI, just replace all uses of PHI result.  It's necessary
8841          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8842       use_operand_p use_p;
8843       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8844         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8845             && !is_gimple_debug (use_stmt))
8846           {
8847             if (gimple_code (use_stmt) == GIMPLE_PHI
8848                 && gimple_phi_num_args (use_stmt) == 1)
8849               {
8850                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8851               }
8852             else
8853               {
8854                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8855                     SET_USE (use_p, new_tree);
8856               }
8857             update_stmt (use_stmt);
8858           }
8859     }
8860   else
8861     {
8862       /* For basic-block vectorization simply insert the lane-extraction.  */
8863       tree bftype = TREE_TYPE (vectype);
8864       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8865         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8866       tree new_tree = build3 (BIT_FIELD_REF, bftype,
8867                               vec_lhs, bitsize, bitstart);
8868       gimple_seq stmts = NULL;
8869       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8870                                        &stmts, true, NULL_TREE);
8871       if (TREE_CODE (new_tree) == SSA_NAME
8872           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8873         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8874       if (is_a <gphi *> (vec_stmt))
8875         {
8876           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8877           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8878         }
8879       else
8880         {
8881           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8882           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8883         }
8884
8885       /* Replace use of lhs with newly computed result.  If the use stmt is a
8886          single arg PHI, just replace all uses of PHI result.  It's necessary
8887          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8888       use_operand_p use_p;
8889       stmt_vec_info use_stmt_info;
8890       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8891         if (!is_gimple_debug (use_stmt)
8892             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8893                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8894           {
8895             /* ???  This can happen when the live lane ends up being
8896                used in a vector construction code-generated by an
8897                external SLP node (and code-generation for that already
8898                happened).  See gcc.dg/vect/bb-slp-47.c.
8899                Doing this is what would happen if that vector CTOR
8900                were not code-generated yet so it is not too bad.
8901                ???  In fact we'd likely want to avoid this situation
8902                in the first place.  */
8903             if (TREE_CODE (new_tree) == SSA_NAME
8904                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8905                 && gimple_code (use_stmt) != GIMPLE_PHI
8906                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8907                                                 use_stmt))
8908               {
8909                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8910                 gcc_assert (code == CONSTRUCTOR
8911                             || code == VIEW_CONVERT_EXPR
8912                             || CONVERT_EXPR_CODE_P (code));
8913                 if (dump_enabled_p ())
8914                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8915                                    "Using original scalar computation for "
8916                                    "live lane because use preceeds vector "
8917                                    "def\n");
8918                 continue;
8919               }
8920             /* ???  It can also happen that we end up pulling a def into
8921                a loop where replacing out-of-loop uses would require
8922                a new LC SSA PHI node.  Retain the original scalar in
8923                those cases as well.  PR98064.  */
8924             if (TREE_CODE (new_tree) == SSA_NAME
8925                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8926                 && (gimple_bb (use_stmt)->loop_father
8927                     != gimple_bb (vec_stmt)->loop_father)
8928                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8929                                         gimple_bb (use_stmt)->loop_father))
8930               {
8931                 if (dump_enabled_p ())
8932                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8933                                    "Using original scalar computation for "
8934                                    "live lane because there is an out-of-loop "
8935                                    "definition for it\n");
8936                 continue;
8937               }
8938             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8939               SET_USE (use_p, new_tree);
8940             update_stmt (use_stmt);
8941           }
8942     }
8943
8944   return true;
8945 }
8946
8947 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8948
8949 static void
8950 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8951 {
8952   ssa_op_iter op_iter;
8953   imm_use_iterator imm_iter;
8954   def_operand_p def_p;
8955   gimple *ustmt;
8956
8957   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8958     {
8959       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8960         {
8961           basic_block bb;
8962
8963           if (!is_gimple_debug (ustmt))
8964             continue;
8965
8966           bb = gimple_bb (ustmt);
8967
8968           if (!flow_bb_inside_loop_p (loop, bb))
8969             {
8970               if (gimple_debug_bind_p (ustmt))
8971                 {
8972                   if (dump_enabled_p ())
8973                     dump_printf_loc (MSG_NOTE, vect_location,
8974                                      "killing debug use\n");
8975
8976                   gimple_debug_bind_reset_value (ustmt);
8977                   update_stmt (ustmt);
8978                 }
8979               else
8980                 gcc_unreachable ();
8981             }
8982         }
8983     }
8984 }
8985
8986 /* Given loop represented by LOOP_VINFO, return true if computation of
8987    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8988    otherwise.  */
8989
8990 static bool
8991 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8992 {
8993   /* Constant case.  */
8994   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8995     {
8996       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8997       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8998
8999       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9000       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9001       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9002         return true;
9003     }
9004
9005   widest_int max;
9006   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9007   /* Check the upper bound of loop niters.  */
9008   if (get_max_loop_iterations (loop, &max))
9009     {
9010       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9011       signop sgn = TYPE_SIGN (type);
9012       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9013       if (max < type_max)
9014         return true;
9015     }
9016   return false;
9017 }
9018
9019 /* Return a mask type with half the number of elements as OLD_TYPE,
9020    given that it should have mode NEW_MODE.  */
9021
9022 tree
9023 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9024 {
9025   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9026   return build_truth_vector_type_for_mode (nunits, new_mode);
9027 }
9028
9029 /* Return a mask type with twice as many elements as OLD_TYPE,
9030    given that it should have mode NEW_MODE.  */
9031
9032 tree
9033 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9034 {
9035   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9036   return build_truth_vector_type_for_mode (nunits, new_mode);
9037 }
9038
9039 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9040    contain a sequence of NVECTORS masks that each control a vector of type
9041    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
9042    these vector masks with the vector version of SCALAR_MASK.  */
9043
9044 void
9045 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9046                        unsigned int nvectors, tree vectype, tree scalar_mask)
9047 {
9048   gcc_assert (nvectors != 0);
9049   if (masks->length () < nvectors)
9050     masks->safe_grow_cleared (nvectors, true);
9051   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9052   /* The number of scalars per iteration and the number of vectors are
9053      both compile-time constants.  */
9054   unsigned int nscalars_per_iter
9055     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9056                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9057
9058   if (scalar_mask)
9059     {
9060       scalar_cond_masked_key cond (scalar_mask, nvectors);
9061       loop_vinfo->scalar_cond_masked_set.add (cond);
9062     }
9063
9064   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9065     {
9066       rgm->max_nscalars_per_iter = nscalars_per_iter;
9067       rgm->type = truth_type_for (vectype);
9068       rgm->factor = 1;
9069     }
9070 }
9071
9072 /* Given a complete set of masks MASKS, extract mask number INDEX
9073    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9074    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
9075
9076    See the comment above vec_loop_masks for more details about the mask
9077    arrangement.  */
9078
9079 tree
9080 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9081                     unsigned int nvectors, tree vectype, unsigned int index)
9082 {
9083   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9084   tree mask_type = rgm->type;
9085
9086   /* Populate the rgroup's mask array, if this is the first time we've
9087      used it.  */
9088   if (rgm->controls.is_empty ())
9089     {
9090       rgm->controls.safe_grow_cleared (nvectors, true);
9091       for (unsigned int i = 0; i < nvectors; ++i)
9092         {
9093           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9094           /* Provide a dummy definition until the real one is available.  */
9095           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9096           rgm->controls[i] = mask;
9097         }
9098     }
9099
9100   tree mask = rgm->controls[index];
9101   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9102                 TYPE_VECTOR_SUBPARTS (vectype)))
9103     {
9104       /* A loop mask for data type X can be reused for data type Y
9105          if X has N times more elements than Y and if Y's elements
9106          are N times bigger than X's.  In this case each sequence
9107          of N elements in the loop mask will be all-zero or all-one.
9108          We can then view-convert the mask so that each sequence of
9109          N elements is replaced by a single element.  */
9110       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9111                               TYPE_VECTOR_SUBPARTS (vectype)));
9112       gimple_seq seq = NULL;
9113       mask_type = truth_type_for (vectype);
9114       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9115       if (seq)
9116         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9117     }
9118   return mask;
9119 }
9120
9121 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9122    lengths for controlling an operation on VECTYPE.  The operation splits
9123    each element of VECTYPE into FACTOR separate subelements, measuring the
9124    length as a number of these subelements.  */
9125
9126 void
9127 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9128                       unsigned int nvectors, tree vectype, unsigned int factor)
9129 {
9130   gcc_assert (nvectors != 0);
9131   if (lens->length () < nvectors)
9132     lens->safe_grow_cleared (nvectors, true);
9133   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9134
9135   /* The number of scalars per iteration, scalar occupied bytes and
9136      the number of vectors are both compile-time constants.  */
9137   unsigned int nscalars_per_iter
9138     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9139                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9140
9141   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9142     {
9143       /* For now, we only support cases in which all loads and stores fall back
9144          to VnQI or none do.  */
9145       gcc_assert (!rgl->max_nscalars_per_iter
9146                   || (rgl->factor == 1 && factor == 1)
9147                   || (rgl->max_nscalars_per_iter * rgl->factor
9148                       == nscalars_per_iter * factor));
9149       rgl->max_nscalars_per_iter = nscalars_per_iter;
9150       rgl->type = vectype;
9151       rgl->factor = factor;
9152     }
9153 }
9154
9155 /* Given a complete set of length LENS, extract length number INDEX for an
9156    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9157
9158 tree
9159 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9160                    unsigned int nvectors, unsigned int index)
9161 {
9162   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9163
9164   /* Populate the rgroup's len array, if this is the first time we've
9165      used it.  */
9166   if (rgl->controls.is_empty ())
9167     {
9168       rgl->controls.safe_grow_cleared (nvectors, true);
9169       for (unsigned int i = 0; i < nvectors; ++i)
9170         {
9171           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9172           gcc_assert (len_type != NULL_TREE);
9173           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9174
9175           /* Provide a dummy definition until the real one is available.  */
9176           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9177           rgl->controls[i] = len;
9178         }
9179     }
9180
9181   return rgl->controls[index];
9182 }
9183
9184 /* Scale profiling counters by estimation for LOOP which is vectorized
9185    by factor VF.  */
9186
9187 static void
9188 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9189 {
9190   edge preheader = loop_preheader_edge (loop);
9191   /* Reduce loop iterations by the vectorization factor.  */
9192   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9193   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9194
9195   if (freq_h.nonzero_p ())
9196     {
9197       profile_probability p;
9198
9199       /* Avoid dropping loop body profile counter to 0 because of zero count
9200          in loop's preheader.  */
9201       if (!(freq_e == profile_count::zero ()))
9202         freq_e = freq_e.force_nonzero ();
9203       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9204       scale_loop_frequencies (loop, p);
9205     }
9206
9207   edge exit_e = single_exit (loop);
9208   exit_e->probability = profile_probability::always ()
9209                                  .apply_scale (1, new_est_niter + 1);
9210
9211   edge exit_l = single_pred_edge (loop->latch);
9212   profile_probability prob = exit_l->probability;
9213   exit_l->probability = exit_e->probability.invert ();
9214   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9215     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9216 }
9217
9218 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9219    latch edge values originally defined by it.  */
9220
9221 static void
9222 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9223                                      stmt_vec_info def_stmt_info)
9224 {
9225   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9226   if (!def || TREE_CODE (def) != SSA_NAME)
9227     return;
9228   stmt_vec_info phi_info;
9229   imm_use_iterator iter;
9230   use_operand_p use_p;
9231   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9232     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9233       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9234           && (phi_info = loop_vinfo->lookup_stmt (phi))
9235           && STMT_VINFO_RELEVANT_P (phi_info)
9236           && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9237           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9238           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9239         {
9240           loop_p loop = gimple_bb (phi)->loop_father;
9241           edge e = loop_latch_edge (loop);
9242           if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9243             {
9244               vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9245               vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9246               gcc_assert (phi_defs.length () == latch_defs.length ());
9247               for (unsigned i = 0; i < phi_defs.length (); ++i)
9248                 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9249                              gimple_get_lhs (latch_defs[i]), e,
9250                              gimple_phi_arg_location (phi, e->dest_idx));
9251             }
9252         }
9253 }
9254
9255 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9256    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9257    stmt_vec_info.  */
9258
9259 static bool
9260 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9261                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9262 {
9263   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9264   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9265
9266   if (dump_enabled_p ())
9267     dump_printf_loc (MSG_NOTE, vect_location,
9268                      "------>vectorizing statement: %G", stmt_info->stmt);
9269
9270   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9271     vect_loop_kill_debug_uses (loop, stmt_info);
9272
9273   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9274       && !STMT_VINFO_LIVE_P (stmt_info))
9275     return false;
9276
9277   if (STMT_VINFO_VECTYPE (stmt_info))
9278     {
9279       poly_uint64 nunits
9280         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9281       if (!STMT_SLP_TYPE (stmt_info)
9282           && maybe_ne (nunits, vf)
9283           && dump_enabled_p ())
9284         /* For SLP VF is set according to unrolling factor, and not
9285            to vector size, hence for SLP this print is not valid.  */
9286         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9287     }
9288
9289   /* Pure SLP statements have already been vectorized.  We still need
9290      to apply loop vectorization to hybrid SLP statements.  */
9291   if (PURE_SLP_STMT (stmt_info))
9292     return false;
9293
9294   if (dump_enabled_p ())
9295     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9296
9297   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9298     *seen_store = stmt_info;
9299
9300   return true;
9301 }
9302
9303 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9304    in the hash_map with its corresponding values.  */
9305
9306 static tree
9307 find_in_mapping (tree t, void *context)
9308 {
9309   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9310
9311   tree *value = mapping->get (t);
9312   return value ? *value : t;
9313 }
9314
9315 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9316    original loop that has now been vectorized.
9317
9318    The inits of the data_references need to be advanced with the number of
9319    iterations of the main loop.  This has been computed in vect_do_peeling and
9320    is stored in parameter ADVANCE.  We first restore the data_references
9321    initial offset with the values recored in ORIG_DRS_INIT.
9322
9323    Since the loop_vec_info of this EPILOGUE was constructed for the original
9324    loop, its stmt_vec_infos all point to the original statements.  These need
9325    to be updated to point to their corresponding copies as well as the SSA_NAMES
9326    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9327
9328    The data_reference's connections also need to be updated.  Their
9329    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9330    stmt_vec_infos, their statements need to point to their corresponding copy,
9331    if they are gather loads or scatter stores then their reference needs to be
9332    updated to point to its corresponding copy and finally we set
9333    'base_misaligned' to false as we have already peeled for alignment in the
9334    prologue of the main loop.  */
9335
9336 static void
9337 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9338 {
9339   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9340   auto_vec<gimple *> stmt_worklist;
9341   hash_map<tree,tree> mapping;
9342   gimple *orig_stmt, *new_stmt;
9343   gimple_stmt_iterator epilogue_gsi;
9344   gphi_iterator epilogue_phi_gsi;
9345   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9346   basic_block *epilogue_bbs = get_loop_body (epilogue);
9347   unsigned i;
9348
9349   free (LOOP_VINFO_BBS (epilogue_vinfo));
9350   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9351
9352   /* Advance data_reference's with the number of iterations of the previous
9353      loop and its prologue.  */
9354   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9355
9356
9357   /* The EPILOGUE loop is a copy of the original loop so they share the same
9358      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9359      point to the copied statements.  We also create a mapping of all LHS' in
9360      the original loop and all the LHS' in the EPILOGUE and create worklists to
9361      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9362   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9363     {
9364       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9365            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9366         {
9367           new_stmt = epilogue_phi_gsi.phi ();
9368
9369           gcc_assert (gimple_uid (new_stmt) > 0);
9370           stmt_vinfo
9371             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9372
9373           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9374           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9375
9376           mapping.put (gimple_phi_result (orig_stmt),
9377                        gimple_phi_result (new_stmt));
9378           /* PHI nodes can not have patterns or related statements.  */
9379           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9380                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9381         }
9382
9383       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9384            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9385         {
9386           new_stmt = gsi_stmt (epilogue_gsi);
9387           if (is_gimple_debug (new_stmt))
9388             continue;
9389
9390           gcc_assert (gimple_uid (new_stmt) > 0);
9391           stmt_vinfo
9392             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9393
9394           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9395           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9396
9397           if (tree old_lhs = gimple_get_lhs (orig_stmt))
9398             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9399
9400           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9401             {
9402               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9403               for (gimple_stmt_iterator gsi = gsi_start (seq);
9404                    !gsi_end_p (gsi); gsi_next (&gsi))
9405                 stmt_worklist.safe_push (gsi_stmt (gsi));
9406             }
9407
9408           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9409           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9410             {
9411               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9412               stmt_worklist.safe_push (stmt);
9413               /* Set BB such that the assert in
9414                 'get_initial_def_for_reduction' is able to determine that
9415                 the BB of the related stmt is inside this loop.  */
9416               gimple_set_bb (stmt,
9417                              gimple_bb (new_stmt));
9418               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9419               gcc_assert (related_vinfo == NULL
9420                           || related_vinfo == stmt_vinfo);
9421             }
9422         }
9423     }
9424
9425   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9426      using the original main loop and thus need to be updated to refer to the
9427      cloned variables used in the epilogue.  */
9428   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9429     {
9430       gimple *stmt = stmt_worklist[i];
9431       tree *new_op;
9432
9433       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9434         {
9435           tree op = gimple_op (stmt, j);
9436           if ((new_op = mapping.get(op)))
9437             gimple_set_op (stmt, j, *new_op);
9438           else
9439             {
9440               /* PR92429: The last argument of simplify_replace_tree disables
9441                  folding when replacing arguments.  This is required as
9442                  otherwise you might end up with different statements than the
9443                  ones analyzed in vect_loop_analyze, leading to different
9444                  vectorization.  */
9445               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9446                                           &find_in_mapping, &mapping, false);
9447               gimple_set_op (stmt, j, op);
9448             }
9449         }
9450     }
9451
9452   struct data_reference *dr;
9453   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9454   FOR_EACH_VEC_ELT (datarefs, i, dr)
9455     {
9456       orig_stmt = DR_STMT (dr);
9457       gcc_assert (gimple_uid (orig_stmt) > 0);
9458       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9459       /* Data references for gather loads and scatter stores do not use the
9460          updated offset we set using ADVANCE.  Instead we have to make sure the
9461          reference in the data references point to the corresponding copy of
9462          the original in the epilogue.  */
9463       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9464           == VMAT_GATHER_SCATTER)
9465         {
9466           DR_REF (dr)
9467             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9468                                      &find_in_mapping, &mapping);
9469           DR_BASE_ADDRESS (dr)
9470             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9471                                      &find_in_mapping, &mapping);
9472         }
9473       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9474       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9475       /* The vector size of the epilogue is smaller than that of the main loop
9476          so the alignment is either the same or lower. This means the dr will
9477          thus by definition be aligned.  */
9478       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9479     }
9480
9481   epilogue_vinfo->shared->datarefs_copy.release ();
9482   epilogue_vinfo->shared->save_datarefs ();
9483 }
9484
9485 /* Function vect_transform_loop.
9486
9487    The analysis phase has determined that the loop is vectorizable.
9488    Vectorize the loop - created vectorized stmts to replace the scalar
9489    stmts in the loop, and update the loop exit condition.
9490    Returns scalar epilogue loop if any.  */
9491
9492 class loop *
9493 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9494 {
9495   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9496   class loop *epilogue = NULL;
9497   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9498   int nbbs = loop->num_nodes;
9499   int i;
9500   tree niters_vector = NULL_TREE;
9501   tree step_vector = NULL_TREE;
9502   tree niters_vector_mult_vf = NULL_TREE;
9503   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9504   unsigned int lowest_vf = constant_lower_bound (vf);
9505   gimple *stmt;
9506   bool check_profitability = false;
9507   unsigned int th;
9508
9509   DUMP_VECT_SCOPE ("vec_transform_loop");
9510
9511   loop_vinfo->shared->check_datarefs ();
9512
9513   /* Use the more conservative vectorization threshold.  If the number
9514      of iterations is constant assume the cost check has been performed
9515      by our caller.  If the threshold makes all loops profitable that
9516      run at least the (estimated) vectorization factor number of times
9517      checking is pointless, too.  */
9518   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9519   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9520     {
9521       if (dump_enabled_p ())
9522         dump_printf_loc (MSG_NOTE, vect_location,
9523                          "Profitability threshold is %d loop iterations.\n",
9524                          th);
9525       check_profitability = true;
9526     }
9527
9528   /* Make sure there exists a single-predecessor exit bb.  Do this before
9529      versioning.   */
9530   edge e = single_exit (loop);
9531   if (! single_pred_p (e->dest))
9532     {
9533       split_loop_exit_edge (e, true);
9534       if (dump_enabled_p ())
9535         dump_printf (MSG_NOTE, "split exit edge\n");
9536     }
9537
9538   /* Version the loop first, if required, so the profitability check
9539      comes first.  */
9540
9541   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9542     {
9543       class loop *sloop
9544         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9545       sloop->force_vectorize = false;
9546       check_profitability = false;
9547     }
9548
9549   /* Make sure there exists a single-predecessor exit bb also on the
9550      scalar loop copy.  Do this after versioning but before peeling
9551      so CFG structure is fine for both scalar and if-converted loop
9552      to make slpeel_duplicate_current_defs_from_edges face matched
9553      loop closed PHI nodes on the exit.  */
9554   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9555     {
9556       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9557       if (! single_pred_p (e->dest))
9558         {
9559           split_loop_exit_edge (e, true);
9560           if (dump_enabled_p ())
9561             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9562         }
9563     }
9564
9565   tree niters = vect_build_loop_niters (loop_vinfo);
9566   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9567   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9568   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9569   tree advance;
9570   drs_init_vec orig_drs_init;
9571
9572   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9573                               &step_vector, &niters_vector_mult_vf, th,
9574                               check_profitability, niters_no_overflow,
9575                               &advance);
9576
9577   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9578       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9579     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9580                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9581
9582   if (niters_vector == NULL_TREE)
9583     {
9584       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9585           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9586           && known_eq (lowest_vf, vf))
9587         {
9588           niters_vector
9589             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9590                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9591           step_vector = build_one_cst (TREE_TYPE (niters));
9592         }
9593       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9594         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9595                                      &step_vector, niters_no_overflow);
9596       else
9597         /* vect_do_peeling subtracted the number of peeled prologue
9598            iterations from LOOP_VINFO_NITERS.  */
9599         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9600                                      &niters_vector, &step_vector,
9601                                      niters_no_overflow);
9602     }
9603
9604   /* 1) Make sure the loop header has exactly two entries
9605      2) Make sure we have a preheader basic block.  */
9606
9607   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9608
9609   split_edge (loop_preheader_edge (loop));
9610
9611   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9612     /* This will deal with any possible peeling.  */
9613     vect_prepare_for_masked_peels (loop_vinfo);
9614
9615   /* Schedule the SLP instances first, then handle loop vectorization
9616      below.  */
9617   if (!loop_vinfo->slp_instances.is_empty ())
9618     {
9619       DUMP_VECT_SCOPE ("scheduling SLP instances");
9620       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9621     }
9622
9623   /* FORNOW: the vectorizer supports only loops which body consist
9624      of one basic block (header + empty latch). When the vectorizer will
9625      support more involved loop forms, the order by which the BBs are
9626      traversed need to be reconsidered.  */
9627
9628   for (i = 0; i < nbbs; i++)
9629     {
9630       basic_block bb = bbs[i];
9631       stmt_vec_info stmt_info;
9632
9633       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9634            gsi_next (&si))
9635         {
9636           gphi *phi = si.phi ();
9637           if (dump_enabled_p ())
9638             dump_printf_loc (MSG_NOTE, vect_location,
9639                              "------>vectorizing phi: %G", phi);
9640           stmt_info = loop_vinfo->lookup_stmt (phi);
9641           if (!stmt_info)
9642             continue;
9643
9644           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9645             vect_loop_kill_debug_uses (loop, stmt_info);
9646
9647           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9648               && !STMT_VINFO_LIVE_P (stmt_info))
9649             continue;
9650
9651           if (STMT_VINFO_VECTYPE (stmt_info)
9652               && (maybe_ne
9653                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9654               && dump_enabled_p ())
9655             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9656
9657           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9658                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9659                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9660                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9661                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9662               && ! PURE_SLP_STMT (stmt_info))
9663             {
9664               if (dump_enabled_p ())
9665                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9666               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9667             }
9668         }
9669
9670       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9671            gsi_next (&si))
9672         {
9673           gphi *phi = si.phi ();
9674           stmt_info = loop_vinfo->lookup_stmt (phi);
9675           if (!stmt_info)
9676             continue;
9677
9678           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9679               && !STMT_VINFO_LIVE_P (stmt_info))
9680             continue;
9681
9682           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9683                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9684                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9685                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9686                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9687               && ! PURE_SLP_STMT (stmt_info))
9688             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9689         }
9690
9691       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9692            !gsi_end_p (si);)
9693         {
9694           stmt = gsi_stmt (si);
9695           /* During vectorization remove existing clobber stmts.  */
9696           if (gimple_clobber_p (stmt))
9697             {
9698               unlink_stmt_vdef (stmt);
9699               gsi_remove (&si, true);
9700               release_defs (stmt);
9701             }
9702           else
9703             {
9704               /* Ignore vector stmts created in the outer loop.  */
9705               stmt_info = loop_vinfo->lookup_stmt (stmt);
9706
9707               /* vector stmts created in the outer-loop during vectorization of
9708                  stmts in an inner-loop may not have a stmt_info, and do not
9709                  need to be vectorized.  */
9710               stmt_vec_info seen_store = NULL;
9711               if (stmt_info)
9712                 {
9713                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9714                     {
9715                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9716                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9717                            !gsi_end_p (subsi); gsi_next (&subsi))
9718                         {
9719                           stmt_vec_info pat_stmt_info
9720                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9721                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9722                                                     &si, &seen_store);
9723                         }
9724                       stmt_vec_info pat_stmt_info
9725                         = STMT_VINFO_RELATED_STMT (stmt_info);
9726                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9727                                                     &si, &seen_store))
9728                         maybe_set_vectorized_backedge_value (loop_vinfo,
9729                                                              pat_stmt_info);
9730                     }
9731                   else
9732                     {
9733                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9734                                                     &seen_store))
9735                         maybe_set_vectorized_backedge_value (loop_vinfo,
9736                                                              stmt_info);
9737                     }
9738                 }
9739               gsi_next (&si);
9740               if (seen_store)
9741                 {
9742                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9743                     /* Interleaving.  If IS_STORE is TRUE, the
9744                        vectorization of the interleaving chain was
9745                        completed - free all the stores in the chain.  */
9746                     vect_remove_stores (loop_vinfo,
9747                                         DR_GROUP_FIRST_ELEMENT (seen_store));
9748                   else
9749                     /* Free the attached stmt_vec_info and remove the stmt.  */
9750                     loop_vinfo->remove_stmt (stmt_info);
9751                 }
9752             }
9753         }
9754
9755       /* Stub out scalar statements that must not survive vectorization.
9756          Doing this here helps with grouped statements, or statements that
9757          are involved in patterns.  */
9758       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9759            !gsi_end_p (gsi); gsi_next (&gsi))
9760         {
9761           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9762           if (!call || !gimple_call_internal_p (call))
9763             continue;
9764           internal_fn ifn = gimple_call_internal_fn (call);
9765           if (ifn == IFN_MASK_LOAD)
9766             {
9767               tree lhs = gimple_get_lhs (call);
9768               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9769                 {
9770                   tree zero = build_zero_cst (TREE_TYPE (lhs));
9771                   gimple *new_stmt = gimple_build_assign (lhs, zero);
9772                   gsi_replace (&gsi, new_stmt, true);
9773                 }
9774             }
9775           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9776             {
9777               tree lhs = gimple_get_lhs (call);
9778               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9779                 {
9780                   tree else_arg
9781                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9782                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9783                   gsi_replace (&gsi, new_stmt, true);
9784                 }
9785             }
9786         }
9787     }                           /* BBs in loop */
9788
9789   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9790      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9791   if (integer_onep (step_vector))
9792     niters_no_overflow = true;
9793   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9794                            niters_vector_mult_vf, !niters_no_overflow);
9795
9796   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9797   scale_profile_for_vect_loop (loop, assumed_vf);
9798
9799   /* True if the final iteration might not handle a full vector's
9800      worth of scalar iterations.  */
9801   bool final_iter_may_be_partial
9802     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9803   /* The minimum number of iterations performed by the epilogue.  This
9804      is 1 when peeling for gaps because we always need a final scalar
9805      iteration.  */
9806   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9807   /* +1 to convert latch counts to loop iteration counts,
9808      -min_epilogue_iters to remove iterations that cannot be performed
9809        by the vector code.  */
9810   int bias_for_lowest = 1 - min_epilogue_iters;
9811   int bias_for_assumed = bias_for_lowest;
9812   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9813   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9814     {
9815       /* When the amount of peeling is known at compile time, the first
9816          iteration will have exactly alignment_npeels active elements.
9817          In the worst case it will have at least one.  */
9818       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9819       bias_for_lowest += lowest_vf - min_first_active;
9820       bias_for_assumed += assumed_vf - min_first_active;
9821     }
9822   /* In these calculations the "- 1" converts loop iteration counts
9823      back to latch counts.  */
9824   if (loop->any_upper_bound)
9825     {
9826       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9827       loop->nb_iterations_upper_bound
9828         = (final_iter_may_be_partial
9829            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9830                             lowest_vf) - 1
9831            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9832                              lowest_vf) - 1);
9833       if (main_vinfo)
9834         {
9835           unsigned int bound;
9836           poly_uint64 main_iters
9837             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9838                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9839           main_iters
9840             = upper_bound (main_iters,
9841                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9842           if (can_div_away_from_zero_p (main_iters,
9843                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9844                                         &bound))
9845             loop->nb_iterations_upper_bound
9846               = wi::umin ((widest_int) (bound - 1),
9847                           loop->nb_iterations_upper_bound);
9848       }
9849   }
9850   if (loop->any_likely_upper_bound)
9851     loop->nb_iterations_likely_upper_bound
9852       = (final_iter_may_be_partial
9853          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9854                           + bias_for_lowest, lowest_vf) - 1
9855          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9856                            + bias_for_lowest, lowest_vf) - 1);
9857   if (loop->any_estimate)
9858     loop->nb_iterations_estimate
9859       = (final_iter_may_be_partial
9860          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9861                           assumed_vf) - 1
9862          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9863                            assumed_vf) - 1);
9864
9865   if (dump_enabled_p ())
9866     {
9867       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9868         {
9869           dump_printf_loc (MSG_NOTE, vect_location,
9870                            "LOOP VECTORIZED\n");
9871           if (loop->inner)
9872             dump_printf_loc (MSG_NOTE, vect_location,
9873                              "OUTER LOOP VECTORIZED\n");
9874           dump_printf (MSG_NOTE, "\n");
9875         }
9876       else
9877         dump_printf_loc (MSG_NOTE, vect_location,
9878                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9879                          GET_MODE_NAME (loop_vinfo->vector_mode));
9880     }
9881
9882   /* Loops vectorized with a variable factor won't benefit from
9883      unrolling/peeling.  */
9884   if (!vf.is_constant ())
9885     {
9886       loop->unroll = 1;
9887       if (dump_enabled_p ())
9888         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9889                          " variable-length vectorization factor\n");
9890     }
9891   /* Free SLP instances here because otherwise stmt reference counting
9892      won't work.  */
9893   slp_instance instance;
9894   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9895     vect_free_slp_instance (instance);
9896   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9897   /* Clear-up safelen field since its value is invalid after vectorization
9898      since vectorized loop can have loop-carried dependencies.  */
9899   loop->safelen = 0;
9900
9901   if (epilogue)
9902     {
9903       update_epilogue_loop_vinfo (epilogue, advance);
9904
9905       epilogue->simduid = loop->simduid;
9906       epilogue->force_vectorize = loop->force_vectorize;
9907       epilogue->dont_vectorize = false;
9908     }
9909
9910   return epilogue;
9911 }
9912
9913 /* The code below is trying to perform simple optimization - revert
9914    if-conversion for masked stores, i.e. if the mask of a store is zero
9915    do not perform it and all stored value producers also if possible.
9916    For example,
9917      for (i=0; i<n; i++)
9918        if (c[i])
9919         {
9920           p1[i] += 1;
9921           p2[i] = p3[i] +2;
9922         }
9923    this transformation will produce the following semi-hammock:
9924
9925    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9926      {
9927        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9928        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9929        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9930        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9931        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9932        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9933      }
9934 */
9935
9936 void
9937 optimize_mask_stores (class loop *loop)
9938 {
9939   basic_block *bbs = get_loop_body (loop);
9940   unsigned nbbs = loop->num_nodes;
9941   unsigned i;
9942   basic_block bb;
9943   class loop *bb_loop;
9944   gimple_stmt_iterator gsi;
9945   gimple *stmt;
9946   auto_vec<gimple *> worklist;
9947   auto_purge_vect_location sentinel;
9948
9949   vect_location = find_loop_location (loop);
9950   /* Pick up all masked stores in loop if any.  */
9951   for (i = 0; i < nbbs; i++)
9952     {
9953       bb = bbs[i];
9954       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9955            gsi_next (&gsi))
9956         {
9957           stmt = gsi_stmt (gsi);
9958           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9959             worklist.safe_push (stmt);
9960         }
9961     }
9962
9963   free (bbs);
9964   if (worklist.is_empty ())
9965     return;
9966
9967   /* Loop has masked stores.  */
9968   while (!worklist.is_empty ())
9969     {
9970       gimple *last, *last_store;
9971       edge e, efalse;
9972       tree mask;
9973       basic_block store_bb, join_bb;
9974       gimple_stmt_iterator gsi_to;
9975       tree vdef, new_vdef;
9976       gphi *phi;
9977       tree vectype;
9978       tree zero;
9979
9980       last = worklist.pop ();
9981       mask = gimple_call_arg (last, 2);
9982       bb = gimple_bb (last);
9983       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9984          the same loop as if_bb.  It could be different to LOOP when two
9985          level loop-nest is vectorized and mask_store belongs to the inner
9986          one.  */
9987       e = split_block (bb, last);
9988       bb_loop = bb->loop_father;
9989       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9990       join_bb = e->dest;
9991       store_bb = create_empty_bb (bb);
9992       add_bb_to_loop (store_bb, bb_loop);
9993       e->flags = EDGE_TRUE_VALUE;
9994       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9995       /* Put STORE_BB to likely part.  */
9996       efalse->probability = profile_probability::unlikely ();
9997       store_bb->count = efalse->count ();
9998       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9999       if (dom_info_available_p (CDI_DOMINATORS))
10000         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10001       if (dump_enabled_p ())
10002         dump_printf_loc (MSG_NOTE, vect_location,
10003                          "Create new block %d to sink mask stores.",
10004                          store_bb->index);
10005       /* Create vector comparison with boolean result.  */
10006       vectype = TREE_TYPE (mask);
10007       zero = build_zero_cst (vectype);
10008       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10009       gsi = gsi_last_bb (bb);
10010       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10011       /* Create new PHI node for vdef of the last masked store:
10012          .MEM_2 = VDEF <.MEM_1>
10013          will be converted to
10014          .MEM.3 = VDEF <.MEM_1>
10015          and new PHI node will be created in join bb
10016          .MEM_2 = PHI <.MEM_1, .MEM_3>
10017       */
10018       vdef = gimple_vdef (last);
10019       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10020       gimple_set_vdef (last, new_vdef);
10021       phi = create_phi_node (vdef, join_bb);
10022       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10023
10024       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10025       while (true)
10026         {
10027           gimple_stmt_iterator gsi_from;
10028           gimple *stmt1 = NULL;
10029
10030           /* Move masked store to STORE_BB.  */
10031           last_store = last;
10032           gsi = gsi_for_stmt (last);
10033           gsi_from = gsi;
10034           /* Shift GSI to the previous stmt for further traversal.  */
10035           gsi_prev (&gsi);
10036           gsi_to = gsi_start_bb (store_bb);
10037           gsi_move_before (&gsi_from, &gsi_to);
10038           /* Setup GSI_TO to the non-empty block start.  */
10039           gsi_to = gsi_start_bb (store_bb);
10040           if (dump_enabled_p ())
10041             dump_printf_loc (MSG_NOTE, vect_location,
10042                              "Move stmt to created bb\n%G", last);
10043           /* Move all stored value producers if possible.  */
10044           while (!gsi_end_p (gsi))
10045             {
10046               tree lhs;
10047               imm_use_iterator imm_iter;
10048               use_operand_p use_p;
10049               bool res;
10050
10051               /* Skip debug statements.  */
10052               if (is_gimple_debug (gsi_stmt (gsi)))
10053                 {
10054                   gsi_prev (&gsi);
10055                   continue;
10056                 }
10057               stmt1 = gsi_stmt (gsi);
10058               /* Do not consider statements writing to memory or having
10059                  volatile operand.  */
10060               if (gimple_vdef (stmt1)
10061                   || gimple_has_volatile_ops (stmt1))
10062                 break;
10063               gsi_from = gsi;
10064               gsi_prev (&gsi);
10065               lhs = gimple_get_lhs (stmt1);
10066               if (!lhs)
10067                 break;
10068
10069               /* LHS of vectorized stmt must be SSA_NAME.  */
10070               if (TREE_CODE (lhs) != SSA_NAME)
10071                 break;
10072
10073               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10074                 {
10075                   /* Remove dead scalar statement.  */
10076                   if (has_zero_uses (lhs))
10077                     {
10078                       gsi_remove (&gsi_from, true);
10079                       continue;
10080                     }
10081                 }
10082
10083               /* Check that LHS does not have uses outside of STORE_BB.  */
10084               res = true;
10085               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10086                 {
10087                   gimple *use_stmt;
10088                   use_stmt = USE_STMT (use_p);
10089                   if (is_gimple_debug (use_stmt))
10090                     continue;
10091                   if (gimple_bb (use_stmt) != store_bb)
10092                     {
10093                       res = false;
10094                       break;
10095                     }
10096                 }
10097               if (!res)
10098                 break;
10099
10100               if (gimple_vuse (stmt1)
10101                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
10102                 break;
10103
10104               /* Can move STMT1 to STORE_BB.  */
10105               if (dump_enabled_p ())
10106                 dump_printf_loc (MSG_NOTE, vect_location,
10107                                  "Move stmt to created bb\n%G", stmt1);
10108               gsi_move_before (&gsi_from, &gsi_to);
10109               /* Shift GSI_TO for further insertion.  */
10110               gsi_prev (&gsi_to);
10111             }
10112           /* Put other masked stores with the same mask to STORE_BB.  */
10113           if (worklist.is_empty ()
10114               || gimple_call_arg (worklist.last (), 2) != mask
10115               || worklist.last () != stmt1)
10116             break;
10117           last = worklist.pop ();
10118         }
10119       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10120     }
10121 }
10122
10123 /* Decide whether it is possible to use a zero-based induction variable
10124    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10125    the value that the induction variable must be able to hold in order
10126    to ensure that the rgroups eventually have no active vector elements.
10127    Return -1 otherwise.  */
10128
10129 widest_int
10130 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10131 {
10132   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10133   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10134   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10135
10136   /* Calculate the value that the induction variable must be able
10137      to hit in order to ensure that we end the loop with an all-false mask.
10138      This involves adding the maximum number of inactive trailing scalar
10139      iterations.  */
10140   widest_int iv_limit = -1;
10141   if (max_loop_iterations (loop, &iv_limit))
10142     {
10143       if (niters_skip)
10144         {
10145           /* Add the maximum number of skipped iterations to the
10146              maximum iteration count.  */
10147           if (TREE_CODE (niters_skip) == INTEGER_CST)
10148             iv_limit += wi::to_widest (niters_skip);
10149           else
10150             iv_limit += max_vf - 1;
10151         }
10152       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10153         /* Make a conservatively-correct assumption.  */
10154         iv_limit += max_vf - 1;
10155
10156       /* IV_LIMIT is the maximum number of latch iterations, which is also
10157          the maximum in-range IV value.  Round this value down to the previous
10158          vector alignment boundary and then add an extra full iteration.  */
10159       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10160       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10161     }
10162   return iv_limit;
10163 }
10164
10165 /* For the given rgroup_controls RGC, check whether an induction variable
10166    would ever hit a value that produces a set of all-false masks or zero
10167    lengths before wrapping around.  Return true if it's possible to wrap
10168    around before hitting the desirable value, otherwise return false.  */
10169
10170 bool
10171 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10172 {
10173   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10174
10175   if (iv_limit == -1)
10176     return true;
10177
10178   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10179   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10180   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10181
10182   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10183     return true;
10184
10185   return false;
10186 }