gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69    as if it were manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140    Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144    Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147    machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
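/* Purely as an illustration (not code used by this pass), the rewrite
   sketched above can be expressed as self-contained C using GCC's vector
   extensions; the trailing scalar loop handles any remainder iterations
   when N is not a multiple of 8.  The function and array names here are
   made up for the example:

     typedef short v8hi __attribute__ ((vector_size (16)));

     #define N 1024
     short a[N], b[N], c[N];

     void
     add_arrays (void)
     {
       int i;
       for (i = 0; i + 8 <= N; i += 8)
         {
           v8hi va, vb, vc;
           __builtin_memcpy (&vb, &b[i], sizeof vb);
           __builtin_memcpy (&vc, &c[i], sizeof vc);
           va = vb + vc;
           __builtin_memcpy (&a[i], &va, sizeof va);
         }
       for (; i < N; i++)
         a[i] = b[i] + c[i];
     }  */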
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257    loop. For example, when vectorizing a loop that operates on 4-byte elements,
258    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
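/* A worked instance of the above, assuming a target with 16-byte vectors:
   4-byte "int" elements give VF = 16 / 4 = 4, so the loop is strip-mined
   by 4 and up to VF - 1 = 3 leftover scalar iterations are handled by an
   epilogue loop (see PEELING_FOR_NITER below); 2-byte "short" elements
   would give VF = 8.  */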
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
346 opt_result res
347 = vect_determine_vf_for_stmt (loop_vinfo,
348 stmt_info, &vectorization_factor);
349 if (!res)
350 return res;
354 /* TODO: Analyze cost. Decide if worth while to vectorize. */
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
358 dump_dec (MSG_NOTE, vectorization_factor);
359 dump_printf (MSG_NOTE, "\n");
362 if (known_le (vectorization_factor, 1U))
363 return opt_result::failure_at (vect_location,
364 "not vectorized: unsupported data-type\n");
365 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
366 return opt_result::success ();
370 /* Function vect_is_simple_iv_evolution.
372    FORNOW: A simple evolution of an induction variable in the loop is
373 considered a polynomial evolution. */
375 static bool
376 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
377 tree * step)
379 tree init_expr;
380 tree step_expr;
381 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
382 basic_block bb;
384 /* When there is no evolution in this loop, the evolution function
385 is not "simple". */
386 if (evolution_part == NULL_TREE)
387 return false;
389 /* When the evolution is a polynomial of degree >= 2
390 the evolution function is not "simple". */
391 if (tree_is_chrec (evolution_part))
392 return false;
394 step_expr = evolution_part;
395 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
399 step_expr, init_expr);
401 *init = init_expr;
402 *step = step_expr;
404 if (TREE_CODE (step_expr) != INTEGER_CST
405 && (TREE_CODE (step_expr) != SSA_NAME
406 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
407 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
408 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
409 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
410 || !flag_associative_math)))
411 && (TREE_CODE (step_expr) != REAL_CST
412 || !flag_associative_math))
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "step unknown.\n");
417 return false;
420 return true;
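/* An illustrative note on the check above: for a counted loop such as

     for (i = 0; i < n; i++)

   scev describes "i" by the chrec {0, +, 1}_loop, whose evolution part
   (the step) is the constant 1 and whose initial condition is 0, so the
   evolution is "simple".  If the step itself varied inside the loop
   (i.e. the evolution part were again a chrec), the evolution would be a
   polynomial of degree >= 2 and vect_is_simple_iv_evolution would
   return false.  */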
423 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
424 what we are assuming is a double reduction. For example, given
425 a structure like this:
427 outer1:
428 x_1 = PHI <x_4(outer2), ...>;
431 inner:
432 x_2 = PHI <x_1(outer1), ...>;
434 x_3 = ...;
437 outer2:
438 x_4 = PHI <x_3(inner)>;
441 outer loop analysis would treat x_1 as a double reduction phi and
442 this function would then return true for x_2. */
444 static bool
445 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
447 use_operand_p use_p;
448 ssa_op_iter op_iter;
449 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
450 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
451 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
452 return true;
453 return false;
456 /* Function vect_analyze_scalar_cycles_1.
458 Examine the cross iteration def-use cycles of scalar variables
459 in LOOP. LOOP_VINFO represents the loop that is now being
460 considered for vectorization (can be LOOP, or an outer-loop
461 enclosing LOOP). */
463 static void
464 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
466 basic_block bb = loop->header;
467 tree init, step;
468 auto_vec<stmt_vec_info, 64> worklist;
469 gphi_iterator gsi;
470 bool double_reduc, reduc_chain;
472 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
474 /* First - identify all inductions. Reduction detection assumes that all the
475 inductions have been identified, therefore, this order must not be
476 changed. */
477 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
479 gphi *phi = gsi.phi ();
480 tree access_fn = NULL;
481 tree def = PHI_RESULT (phi);
482 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
487 /* Skip virtual phi's. The data dependences that are associated with
488 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
489 if (virtual_operand_p (def))
490 continue;
492 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
494 /* Analyze the evolution function. */
495 access_fn = analyze_scalar_evolution (loop, def);
496 if (access_fn)
498 STRIP_NOPS (access_fn);
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "Access function of PHI: %T\n", access_fn);
502 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
503 = initial_condition_in_loop_num (access_fn, loop->num);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
505 = evolution_part_in_loop_num (access_fn, loop->num);
508 if (!access_fn
509 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
510 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
511 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
512 && TREE_CODE (step) != INTEGER_CST))
514 worklist.safe_push (stmt_vinfo);
515 continue;
518 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 != NULL_TREE);
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
524 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
528 /* Second - identify all reductions and nested cycles. */
529 while (worklist.length () > 0)
531 stmt_vec_info stmt_vinfo = worklist.pop ();
532 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
533 tree def = PHI_RESULT (phi);
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
538 gcc_assert (!virtual_operand_p (def)
539 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
541 stmt_vec_info reduc_stmt_info
542 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
543 &reduc_chain);
544 if (reduc_stmt_info)
546 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
547 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
548 if (double_reduc)
550 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location,
552 "Detected double reduction.\n");
554 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
555 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
557 else
559 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
561 if (dump_enabled_p ())
562 dump_printf_loc (MSG_NOTE, vect_location,
563 "Detected vectorizable nested cycle.\n");
565 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
567 else
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
574 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
575 /* Store the reduction cycles for possible vectorization in
576 loop-aware SLP if it was not detected as reduction
577 chain. */
578 if (! reduc_chain)
579 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
580 (reduc_stmt_info);
584 else
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Unknown def-use cycle pattern.\n");
592 /* Function vect_analyze_scalar_cycles.
594 Examine the cross iteration def-use cycles of scalar variables, by
595 analyzing the loop-header PHIs of scalar variables. Classify each
596 cycle as one of the following: invariant, induction, reduction, unknown.
597    We do that for the loop represented by LOOP_VINFO, and also for its
598    inner-loop, if it exists.
599 Examples for scalar cycles:
601 Example1: reduction:
603 loop1:
604 for (i=0; i<N; i++)
605 sum += a[i];
607 Example2: induction:
609 loop2:
610 for (i=0; i<N; i++)
611 a[i] = i; */
613 static void
614 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
616 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
618 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
620 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
621    Reductions in such an inner-loop therefore have different properties than
622 the reductions in the nest that gets vectorized:
623 1. When vectorized, they are executed in the same order as in the original
624 scalar loop, so we can't change the order of computation when
625 vectorizing them.
626 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
627 current checks are too strict. */
629 if (loop->inner)
630 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 /* Transfer group and reduction information from STMT_INFO to its
634 pattern stmt. */
636 static void
637 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
639 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
640 stmt_vec_info stmtp;
641 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
642 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
643 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
647 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
648 == STMT_VINFO_DEF_TYPE (stmt_info));
649 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
650 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
651 if (stmt_info)
652 REDUC_GROUP_NEXT_ELEMENT (stmtp)
653 = STMT_VINFO_RELATED_STMT (stmt_info);
655 while (stmt_info);
658 /* Fixup scalar cycles that now have their stmts detected as patterns. */
660 static void
661 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
663 stmt_vec_info first;
664 unsigned i;
666 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
667 if (STMT_VINFO_IN_PATTERN_P (first))
669 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
670 while (next)
672 if (! STMT_VINFO_IN_PATTERN_P (next)
673 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
674 break;
675 next = REDUC_GROUP_NEXT_ELEMENT (next);
677    /* If not all stmts in the chain are patterns or if we failed
678    to update STMT_VINFO_REDUC_IDX, try to handle the chain
679 without patterns. */
680 if (! next
681 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
683 vect_fixup_reduc_chain (first);
684 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
685 = STMT_VINFO_RELATED_STMT (first);
690 /* Function vect_get_loop_niters.
692 Determine how many iterations the loop is executed and place it
693 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
694 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
695 niter information holds in ASSUMPTIONS.
697 Return the loop exit condition. */
700 static gcond *
701 vect_get_loop_niters (class loop *loop, tree *assumptions,
702 tree *number_of_iterations, tree *number_of_iterationsm1)
704 edge exit = single_exit (loop);
705 class tree_niter_desc niter_desc;
706 tree niter_assumptions, niter, may_be_zero;
707 gcond *cond = get_loop_exit_condition (loop);
709 *assumptions = boolean_true_node;
710 *number_of_iterationsm1 = chrec_dont_know;
711 *number_of_iterations = chrec_dont_know;
712 DUMP_VECT_SCOPE ("get_loop_niters");
714 if (!exit)
715 return cond;
717 may_be_zero = NULL_TREE;
718 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
719 || chrec_contains_undetermined (niter_desc.niter))
720 return cond;
722 niter_assumptions = niter_desc.assumptions;
723 may_be_zero = niter_desc.may_be_zero;
724 niter = niter_desc.niter;
726 if (may_be_zero && integer_zerop (may_be_zero))
727 may_be_zero = NULL_TREE;
729 if (may_be_zero)
731 if (COMPARISON_CLASS_P (may_be_zero))
733 /* Try to combine may_be_zero with assumptions, this can simplify
734 computation of niter expression. */
735 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
736 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
737 niter_assumptions,
738 fold_build1 (TRUTH_NOT_EXPR,
739 boolean_type_node,
740 may_be_zero));
741 else
742 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
743 build_int_cst (TREE_TYPE (niter), 0),
744 rewrite_to_non_trapping_overflow (niter));
746 may_be_zero = NULL_TREE;
748 else if (integer_nonzerop (may_be_zero))
750 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
751 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
752 return cond;
754 else
755 return cond;
758 *assumptions = niter_assumptions;
759 *number_of_iterationsm1 = niter;
761 /* We want the number of loop header executions which is the number
762 of latch executions plus one.
763 ??? For UINT_MAX latch executions this number overflows to zero
764 for loops like do { n++; } while (n != 0); */
765 if (niter && !chrec_contains_undetermined (niter))
766 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
767 build_int_cst (TREE_TYPE (niter), 1));
768 *number_of_iterations = niter;
770 return cond;
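/* A small worked example of the conventions used above: for a simple
   counted loop

     for (i = 0; i < n; i++)

   with n > 0, the latch executes n - 1 times, so *NUMBER_OF_ITERATIONSM1
   is n - 1 and *NUMBER_OF_ITERATIONS (the number of header executions,
   i.e. latch executions plus one) is n.  */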
773 /* Function bb_in_loop_p
775 Used as predicate for dfs order traversal of the loop bbs. */
777 static bool
778 bb_in_loop_p (const_basic_block bb, const void *data)
780 const class loop *const loop = (const class loop *)data;
781 if (flow_bb_inside_loop_p (loop, bb))
782 return true;
783 return false;
787 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
788 stmt_vec_info structs for all the stmts in LOOP_IN. */
790 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
791 : vec_info (vec_info::loop, init_cost (loop_in), shared),
792 loop (loop_in),
793 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
794 num_itersm1 (NULL_TREE),
795 num_iters (NULL_TREE),
796 num_iters_unchanged (NULL_TREE),
797 num_iters_assumptions (NULL_TREE),
798 th (0),
799 versioning_threshold (0),
800 vectorization_factor (0),
801 max_vectorization_factor (0),
802 mask_skip_niters (NULL_TREE),
803 mask_compare_type (NULL_TREE),
804 simd_if_cond (NULL_TREE),
805 unaligned_dr (NULL),
806 peeling_for_alignment (0),
807 ptr_mask (0),
808 ivexpr_map (NULL),
809 scan_map (NULL),
810 slp_unrolling_factor (1),
811 single_scalar_iteration_cost (0),
812 vec_outside_cost (0),
813 vec_inside_cost (0),
814 vectorizable (false),
815 can_fully_mask_p (true),
816 fully_masked_p (false),
817 peeling_for_gaps (false),
818 peeling_for_niter (false),
819 no_data_dependencies (false),
820 has_mask_store (false),
821 scalar_loop_scaling (profile_probability::uninitialized ()),
822 scalar_loop (NULL),
823 orig_loop_info (NULL)
825 /* CHECKME: We want to visit all BBs before their successors (except for
826 latch blocks, for which this assertion wouldn't hold). In the simple
827    case of the loop forms we allow, a dfs order of the BBs would be the same
828 as reversed postorder traversal, so we are safe. */
830 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
831 bbs, loop->num_nodes, loop);
832 gcc_assert (nbbs == loop->num_nodes);
834 for (unsigned int i = 0; i < nbbs; i++)
836 basic_block bb = bbs[i];
837 gimple_stmt_iterator si;
839 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
841 gimple *phi = gsi_stmt (si);
842 gimple_set_uid (phi, 0);
843 add_stmt (phi);
846 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
848 gimple *stmt = gsi_stmt (si);
849 gimple_set_uid (stmt, 0);
850 add_stmt (stmt);
851    /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
852    third argument is the #pragma omp simd if (x) condition: when it is 0,
853    the loop shouldn't be vectorized; when it is a non-zero constant, it
854    should be vectorized normally; otherwise the loop is versioned, with the
855    vectorized loop executed if the condition is non-zero at runtime. */
856 if (loop_in->simduid
857 && is_gimple_call (stmt)
858 && gimple_call_internal_p (stmt)
859 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
860 && gimple_call_num_args (stmt) >= 3
861 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
862 && (loop_in->simduid
863 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
865 tree arg = gimple_call_arg (stmt, 2);
866 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
867 simd_if_cond = arg;
868 else
869 gcc_assert (integer_nonzerop (arg));
874 epilogue_vinfos.create (6);
877 /* Free all levels of MASKS. */
879 void
880 release_vec_loop_masks (vec_loop_masks *masks)
882 rgroup_masks *rgm;
883 unsigned int i;
884 FOR_EACH_VEC_ELT (*masks, i, rgm)
885 rgm->masks.release ();
886 masks->release ();
889 /* Free all memory used by the _loop_vec_info, as well as all the
890 stmt_vec_info structs of all the stmts in the loop. */
892 _loop_vec_info::~_loop_vec_info ()
894 free (bbs);
896 release_vec_loop_masks (&masks);
897 delete ivexpr_map;
898 delete scan_map;
899 epilogue_vinfos.release ();
901 loop->aux = NULL;
904 /* Return an invariant or register for EXPR and emit necessary
905 computations in the LOOP_VINFO loop preheader. */
907 tree
908 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
910 if (is_gimple_reg (expr)
911 || is_gimple_min_invariant (expr))
912 return expr;
914 if (! loop_vinfo->ivexpr_map)
915 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
916 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
917 if (! cached)
919 gimple_seq stmts = NULL;
920 cached = force_gimple_operand (unshare_expr (expr),
921 &stmts, true, NULL_TREE);
922 if (stmts)
924 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
925 gsi_insert_seq_on_edge_immediate (e, stmts);
928 return cached;
931 /* Return true if we can use CMP_TYPE as the comparison type to produce
932 all masks required to mask LOOP_VINFO. */
934 static bool
935 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
937 rgroup_masks *rgm;
938 unsigned int i;
939 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
940 if (rgm->mask_type != NULL_TREE
941 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
942 cmp_type, rgm->mask_type,
943 OPTIMIZE_FOR_SPEED))
944 return false;
945 return true;
948 /* Calculate the maximum number of scalars per iteration for every
949 rgroup in LOOP_VINFO. */
951 static unsigned int
952 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
954 unsigned int res = 1;
955 unsigned int i;
956 rgroup_masks *rgm;
957 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
958 res = MAX (res, rgm->max_nscalars_per_iter);
959 return res;
962 /* Each statement in LOOP_VINFO can be masked where necessary. Check
963 whether we can actually generate the masks required. Return true if so,
964 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
966 static bool
967 vect_verify_full_masking (loop_vec_info loop_vinfo)
969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
970 unsigned int min_ni_width;
971 unsigned int max_nscalars_per_iter
972 = vect_get_max_nscalars_per_iter (loop_vinfo);
974 /* Use a normal loop if there are no statements that need masking.
975 This only happens in rare degenerate cases: it means that the loop
976 has no loads, no stores, and no live-out values. */
977 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
978 return false;
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Account for rgroup masks, in which each bit is replicated N times. */
991 max_ni *= max_nscalars_per_iter;
993 /* Work out how many bits we need to represent the limit. */
994 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
996 /* Find a scalar mode for which WHILE_ULT is supported. */
997 opt_scalar_int_mode cmp_mode_iter;
998 tree cmp_type = NULL_TREE;
999 tree iv_type = NULL_TREE;
1000 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1001 unsigned int iv_precision = UINT_MAX;
1003 if (iv_limit != -1)
1004 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1005 UNSIGNED);
1007 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1009 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1010 if (cmp_bits >= min_ni_width
1011 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1013 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1014 if (this_type
1015 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1017 /* Although we could stop as soon as we find a valid mode,
1018 there are at least two reasons why that's not always the
1019 best choice:
1021 - An IV that's Pmode or wider is more likely to be reusable
1022 in address calculations than an IV that's narrower than
1023 Pmode.
1025 - Doing the comparison in IV_PRECISION or wider allows
1026 a natural 0-based IV, whereas using a narrower comparison
1027 type requires mitigations against wrap-around.
1029 Conversely, if the IV limit is variable, doing the comparison
1030 in a wider type than the original type can introduce
1031 unnecessary extensions, so picking the widest valid mode
1032 is not always a good choice either.
1034 Here we prefer the first IV type that's Pmode or wider,
1035 and the first comparison type that's IV_PRECISION or wider.
1036 (The comparison type must be no wider than the IV type,
1037 to avoid extensions in the vector loop.)
1039 ??? We might want to try continuing beyond Pmode for ILP32
1040 targets if CMP_BITS < IV_PRECISION. */
1041 iv_type = this_type;
1042 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1043 cmp_type = this_type;
1044 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1045 break;
1050 if (!cmp_type)
1051 return false;
1053 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1054 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1055 return true;
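/* For illustration only (a sketch, not the code this pass emits): a
   fully-masked loop conceptually turns

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   into a loop in which each vector iteration runs under a mask obtained
   by comparing lane indices against n (what IFN_WHILE_ULT computes), so
   no scalar epilogue is needed.  A scalar stand-in for the masked form:

     for (i = 0; i < n; i += VF)
       for (lane = 0; lane < VF; lane++)
         if (i + lane < n)
           a[i + lane] = b[i + lane] + c[i + lane];  */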
1058 /* Calculate the cost of one scalar iteration of the loop. */
1059 static void
1060 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1062 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1063 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1064 int nbbs = loop->num_nodes, factor;
1065 int innerloop_iters, i;
1067 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1069 /* Gather costs for statements in the scalar loop. */
1071 /* FORNOW. */
1072 innerloop_iters = 1;
1073 if (loop->inner)
1074 innerloop_iters = 50; /* FIXME */
1076 for (i = 0; i < nbbs; i++)
1078 gimple_stmt_iterator si;
1079 basic_block bb = bbs[i];
1081 if (bb->loop_father == loop->inner)
1082 factor = innerloop_iters;
1083 else
1084 factor = 1;
1086 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1088 gimple *stmt = gsi_stmt (si);
1089 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1091 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1092 continue;
1094 /* Skip stmts that are not vectorized inside the loop. */
1095 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1096 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1097 && (!STMT_VINFO_LIVE_P (vstmt_info)
1098 || !VECTORIZABLE_CYCLE_DEF
1099 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1100 continue;
1102 vect_cost_for_stmt kind;
1103 if (STMT_VINFO_DATA_REF (stmt_info))
1105 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1106 kind = scalar_load;
1107 else
1108 kind = scalar_store;
1110 else if (vect_nop_conversion_p (stmt_info))
1111 continue;
1112 else
1113 kind = scalar_stmt;
1115 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1116 factor, kind, stmt_info, 0, vect_prologue);
1120 /* Now accumulate cost. */
1121 void *target_cost_data = init_cost (loop);
1122 stmt_info_for_cost *si;
1123 int j;
1124 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1125 j, si)
1126 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1127 si->kind, si->stmt_info, si->misalign,
1128 vect_body);
1129 unsigned dummy, body_cost = 0;
1130 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1131 destroy_cost_data (target_cost_data);
1132 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1136 /* Function vect_analyze_loop_form_1.
1138 Verify that certain CFG restrictions hold, including:
1139 - the loop has a pre-header
1140 - the loop has a single entry and exit
1141 - the loop exit condition is simple enough
1142    - the number of iterations can be analyzed, i.e., a countable loop. The
1143 niter could be analyzed under some assumptions. */
1145 opt_result
1146 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1147 tree *assumptions, tree *number_of_iterationsm1,
1148 tree *number_of_iterations, gcond **inner_loop_cond)
1150 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1152 /* Different restrictions apply when we are considering an inner-most loop,
1153 vs. an outer (nested) loop.
1154 (FORNOW. May want to relax some of these restrictions in the future). */
1156 if (!loop->inner)
1158 /* Inner-most loop. We currently require that the number of BBs is
1159 exactly 2 (the header and latch). Vectorizable inner-most loops
1160 look like this:
1162 (pre-header)
1164 header <--------+
1165 | | |
1166 | +--> latch --+
1168 (exit-bb) */
1170 if (loop->num_nodes != 2)
1171 return opt_result::failure_at (vect_location,
1172 "not vectorized:"
1173 " control flow in loop.\n");
1175 if (empty_block_p (loop->header))
1176 return opt_result::failure_at (vect_location,
1177 "not vectorized: empty loop.\n");
1179 else
1181 class loop *innerloop = loop->inner;
1182 edge entryedge;
1184 /* Nested loop. We currently require that the loop is doubly-nested,
1185 contains a single inner loop, and the number of BBs is exactly 5.
1186 Vectorizable outer-loops look like this:
1188 (pre-header)
1190 header <---+
1192 inner-loop |
1194 tail ------+
1196 (exit-bb)
1198 The inner-loop has the properties expected of inner-most loops
1199 as described above. */
1201 if ((loop->inner)->inner || (loop->inner)->next)
1202 return opt_result::failure_at (vect_location,
1203 "not vectorized:"
1204 " multiple nested loops.\n");
1206 if (loop->num_nodes != 5)
1207 return opt_result::failure_at (vect_location,
1208 "not vectorized:"
1209 " control flow in loop.\n");
1211 entryedge = loop_preheader_edge (innerloop);
1212 if (entryedge->src != loop->header
1213 || !single_exit (innerloop)
1214 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1215 return opt_result::failure_at (vect_location,
1216 "not vectorized:"
1217 " unsupported outerloop form.\n");
1219 /* Analyze the inner-loop. */
1220 tree inner_niterm1, inner_niter, inner_assumptions;
1221 opt_result res
1222 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1223 &inner_assumptions, &inner_niterm1,
1224 &inner_niter, NULL);
1225 if (!res)
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: Bad inner loop.\n");
1230 return res;
1233 /* Don't support analyzing niter under assumptions for inner
1234 loop. */
1235 if (!integer_onep (inner_assumptions))
1236 return opt_result::failure_at (vect_location,
1237 "not vectorized: Bad inner loop.\n");
1239 if (!expr_invariant_in_loop_p (loop, inner_niter))
1240 return opt_result::failure_at (vect_location,
1241 "not vectorized: inner-loop count not"
1242 " invariant.\n");
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_NOTE, vect_location,
1246 "Considering outer-loop vectorization.\n");
1249 if (!single_exit (loop))
1250 return opt_result::failure_at (vect_location,
1251 "not vectorized: multiple exits.\n");
1252 if (EDGE_COUNT (loop->header->preds) != 2)
1253 return opt_result::failure_at (vect_location,
1254 "not vectorized:"
1255 " too many incoming edges.\n");
1257    /* We assume that the loop exit condition is at the end of the loop, i.e.,
1258 that the loop is represented as a do-while (with a proper if-guard
1259 before the loop if needed), where the loop header contains all the
1260 executable statements, and the latch is empty. */
1261 if (!empty_block_p (loop->latch)
1262 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized: latch block not empty.\n");
1266 /* Make sure the exit is not abnormal. */
1267 edge e = single_exit (loop);
1268 if (e->flags & EDGE_ABNORMAL)
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized:"
1271 " abnormal loop exit edge.\n");
1273 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1274 number_of_iterationsm1);
1275 if (!*loop_cond)
1276 return opt_result::failure_at
1277 (vect_location,
1278 "not vectorized: complicated exit condition.\n");
1280 if (integer_zerop (*assumptions)
1281 || !*number_of_iterations
1282 || chrec_contains_undetermined (*number_of_iterations))
1283 return opt_result::failure_at
1284 (*loop_cond,
1285 "not vectorized: number of iterations cannot be computed.\n");
1287 if (integer_zerop (*number_of_iterations))
1288 return opt_result::failure_at
1289 (*loop_cond,
1290 "not vectorized: number of iterations = 0.\n");
1292 return opt_result::success ();
1295 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1297 opt_loop_vec_info
1298 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1300 tree assumptions, number_of_iterations, number_of_iterationsm1;
1301 gcond *loop_cond, *inner_loop_cond = NULL;
1303 opt_result res
1304 = vect_analyze_loop_form_1 (loop, &loop_cond,
1305 &assumptions, &number_of_iterationsm1,
1306 &number_of_iterations, &inner_loop_cond);
1307 if (!res)
1308 return opt_loop_vec_info::propagate_failure (res);
1310 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1311 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1312 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1313 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1314 if (!integer_onep (assumptions))
1316    /* We consider vectorizing this loop by versioning it under
1317 some assumptions. In order to do this, we need to clear
1318 existing information computed by scev and niter analyzer. */
1319 scev_reset_htab ();
1320 free_numbers_of_iterations_estimates (loop);
1321 /* Also set flag for this loop so that following scev and niter
1322 analysis are done under the assumptions. */
1323 loop_constraint_set (loop, LOOP_C_FINITE);
1324 /* Also record the assumptions for versioning. */
1325 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1328 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1330 if (dump_enabled_p ())
1332 dump_printf_loc (MSG_NOTE, vect_location,
1333 "Symbolic number of iterations is ");
1334 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1335 dump_printf (MSG_NOTE, "\n");
1339 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1340 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1341 if (inner_loop_cond)
1343 stmt_vec_info inner_loop_cond_info
1344 = loop_vinfo->lookup_stmt (inner_loop_cond);
1345 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1348 gcc_assert (!loop->aux);
1349 loop->aux = loop_vinfo;
1350 return opt_loop_vec_info::success (loop_vinfo);
1355 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1356 statements update the vectorization factor. */
1358 static void
1359 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1362 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1363 int nbbs = loop->num_nodes;
1364 poly_uint64 vectorization_factor;
1365 int i;
1367 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1369 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1370 gcc_assert (known_ne (vectorization_factor, 0U));
1372 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1373 vectorization factor of the loop is the unrolling factor required by
1374    the SLP instances. If that unrolling factor is 1, we say that we
1375    perform pure SLP on the loop; cross-iteration parallelism is not
1376    exploited.
1377 bool only_slp_in_loop = true;
1378 for (i = 0; i < nbbs; i++)
1380 basic_block bb = bbs[i];
1381 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1382 gsi_next (&si))
1384 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1385 if (!stmt_info)
1386 continue;
1387 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1388 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1389 && !PURE_SLP_STMT (stmt_info))
1390 /* STMT needs both SLP and loop-based vectorization. */
1391 only_slp_in_loop = false;
1393 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1394 gsi_next (&si))
1396 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1397 stmt_info = vect_stmt_to_vectorize (stmt_info);
1398 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1399 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1400 && !PURE_SLP_STMT (stmt_info))
1401 /* STMT needs both SLP and loop-based vectorization. */
1402 only_slp_in_loop = false;
1406 if (only_slp_in_loop)
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_NOTE, vect_location,
1410 "Loop contains only SLP stmts\n");
1411 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413 else
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE, vect_location,
1417 "Loop contains SLP and non-SLP stmts\n");
1418 /* Both the vectorization factor and unroll factor have the form
1419 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1420 so they must have a common multiple. */
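      /* For example (assuming the constant case): a vectorization factor
	 of 4 and an SLP unrolling factor of 2 give 4, whereas 4 and 3
	 would give 12, i.e. the least common multiple.  */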
1421 vectorization_factor
1422 = force_common_multiple (vectorization_factor,
1423 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1426 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1427 if (dump_enabled_p ())
1429 dump_printf_loc (MSG_NOTE, vect_location,
1430 "Updating vectorization factor to ");
1431 dump_dec (MSG_NOTE, vectorization_factor);
1432 dump_printf (MSG_NOTE, ".\n");
1436 /* Return true if STMT_INFO describes a double reduction phi and if
1437 the other phi in the reduction is also relevant for vectorization.
1438 This rejects cases such as:
1440 outer1:
1441 x_1 = PHI <x_3(outer2), ...>;
1444 inner:
1445 x_2 = ...;
1448 outer2:
1449 x_3 = PHI <x_2(inner)>;
1451 if nothing in x_2 or elsewhere makes x_1 relevant. */
1453 static bool
1454 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1456 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1457 return false;
1459 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1462 /* Function vect_analyze_loop_operations.
1464 Scan the loop stmts and make sure they are all vectorizable. */
1466 static opt_result
1467 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1469 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1470 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1471 int nbbs = loop->num_nodes;
1472 int i;
1473 stmt_vec_info stmt_info;
1474 bool need_to_vectorize = false;
1475 bool ok;
1477 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1479 auto_vec<stmt_info_for_cost> cost_vec;
1481 for (i = 0; i < nbbs; i++)
1483 basic_block bb = bbs[i];
1485 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1486 gsi_next (&si))
1488 gphi *phi = si.phi ();
1489 ok = true;
1491 stmt_info = loop_vinfo->lookup_stmt (phi);
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1494 if (virtual_operand_p (gimple_phi_result (phi)))
1495 continue;
1497 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1498 (i.e., a phi in the tail of the outer-loop). */
1499 if (! is_loop_header_bb_p (bb))
1501 /* FORNOW: we currently don't support the case that these phis
1502 are not used in the outerloop (unless it is double reduction,
1503 i.e., this phi is vect_reduction_def), cause this case
1504 requires to actually do something here. */
1505 if (STMT_VINFO_LIVE_P (stmt_info)
1506 && !vect_active_double_reduction_p (stmt_info))
1507 return opt_result::failure_at (phi,
1508 "Unsupported loop-closed phi"
1509 " in outer-loop.\n");
1511 /* If PHI is used in the outer loop, we check that its operand
1512 is defined in the inner loop. */
1513 if (STMT_VINFO_RELEVANT_P (stmt_info))
1515 tree phi_op;
1517 if (gimple_phi_num_args (phi) != 1)
1518 return opt_result::failure_at (phi, "unsupported phi");
1520 phi_op = PHI_ARG_DEF (phi, 0);
1521 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1522 if (!op_def_info)
1523 return opt_result::failure_at (phi, "unsupported phi\n");
1525 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1526 && (STMT_VINFO_RELEVANT (op_def_info)
1527 != vect_used_in_outer_by_reduction))
1528 return opt_result::failure_at (phi, "unsupported phi\n");
1530 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1531 || (STMT_VINFO_DEF_TYPE (stmt_info)
1532 == vect_double_reduction_def))
1533 && !vectorizable_lc_phi (loop_vinfo,
1534 stmt_info, NULL, NULL))
1535 return opt_result::failure_at (phi, "unsupported phi\n");
1538 continue;
1541 gcc_assert (stmt_info);
1543 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1544 || STMT_VINFO_LIVE_P (stmt_info))
1545 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1546 /* A scalar-dependence cycle that we don't support. */
1547 return opt_result::failure_at (phi,
1548 "not vectorized:"
1549 " scalar dependence cycle.\n");
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1553 need_to_vectorize = true;
1554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1555 && ! PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_induction (loop_vinfo,
1557 stmt_info, NULL, NULL, NULL,
1558 &cost_vec);
1559 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1560 || (STMT_VINFO_DEF_TYPE (stmt_info)
1561 == vect_double_reduction_def)
1562 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1563 && ! PURE_SLP_STMT (stmt_info))
1564 ok = vectorizable_reduction (loop_vinfo,
1565 stmt_info, NULL, NULL, &cost_vec);
1568 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1569 if (ok
1570 && STMT_VINFO_LIVE_P (stmt_info)
1571 && !PURE_SLP_STMT (stmt_info))
1572 ok = vectorizable_live_operation (loop_vinfo,
1573 stmt_info, NULL, NULL, NULL,
1574 -1, false, &cost_vec);
1576 if (!ok)
1577 return opt_result::failure_at (phi,
1578 "not vectorized: relevant phi not "
1579 "supported: %G",
1580 static_cast <gimple *> (phi));
1583 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1584 gsi_next (&si))
1586 gimple *stmt = gsi_stmt (si);
1587 if (!gimple_clobber_p (stmt))
1589 opt_result res
1590 = vect_analyze_stmt (loop_vinfo,
1591 loop_vinfo->lookup_stmt (stmt),
1592 &need_to_vectorize,
1593 NULL, NULL, &cost_vec);
1594 if (!res)
1595 return res;
1598 } /* bbs */
1600 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1602 /* All operations in the loop are either irrelevant (deal with loop
1603 control, or dead), or only used outside the loop and can be moved
1604 out of the loop (e.g. invariants, inductions). The loop can be
1605 optimized away by scalar optimizations. We're better off not
1606 touching this loop. */
1607 if (!need_to_vectorize)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_NOTE, vect_location,
1611 "All the computation can be taken out of the loop.\n");
1612 return opt_result::failure_at
1613 (vect_location,
1614 "not vectorized: redundant loop. no profit to vectorize.\n");
1617 return opt_result::success ();
1620 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1621 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1622 definitely no, or -1 if it's worth retrying. */
1624 static int
1625 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1630 /* Only fully-masked loops can have iteration counts less than the
1631 vectorization factor. */
1632 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1634 HOST_WIDE_INT max_niter;
1636 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1637 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1638 else
1639 max_niter = max_stmt_executions_int (loop);
1641 if (max_niter != -1
1642 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: iteration count smaller than "
1647 "vectorization factor.\n");
1648 return 0;
1652 int min_profitable_iters, min_profitable_estimate;
1653 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1654 &min_profitable_estimate);
1656 if (min_profitable_iters < 0)
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "not vectorized: vectorization not profitable.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vector version will never be "
1664 "profitable.\n");
1665 return -1;
1668 int min_scalar_loop_bound = (param_min_vect_loop_bound
1669 * assumed_vf);
1671 /* Use the cost model only if it is more conservative than user specified
1672 threshold. */
1673 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1674 min_profitable_iters);
1676 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1679 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: vectorization not profitable.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_NOTE, vect_location,
1686 "not vectorized: iteration count smaller than user "
1687 "specified loop bound parameter or minimum profitable "
1688 "iterations (whichever is more conservative).\n");
1689 return 0;
1692    /* The static profitability threshold min_profitable_estimate includes
1693 the cost of having to check at runtime whether the scalar loop
1694 should be used instead. If it turns out that we don't need or want
1695 such a check, the threshold we should use for the static estimate
1696 is simply the point at which the vector loop becomes more profitable
1697 than the scalar loop. */
1698 if (min_profitable_estimate > min_profitable_iters
1699 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1700 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1701 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1702 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1706 " choice between the scalar and vector loops\n");
1707 min_profitable_estimate = min_profitable_iters;
1710 HOST_WIDE_INT estimated_niter;
1712 /* If we are vectorizing an epilogue then we know the maximum number of
1713 scalar iterations it will cover is at least one lower than the
1714 vectorization factor of the main loop. */
1715 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1716 estimated_niter
1717 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1718 else
1720 estimated_niter = estimated_stmt_executions_int (loop);
1721 if (estimated_niter == -1)
1722 estimated_niter = likely_max_stmt_executions_int (loop);
1724 if (estimated_niter != -1
1725 && ((unsigned HOST_WIDE_INT) estimated_niter
1726 < MAX (th, (unsigned) min_profitable_estimate)))
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "not vectorized: estimated iteration count too "
1731 "small.\n");
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_NOTE, vect_location,
1734 "not vectorized: estimated iteration count smaller "
1735 "than specified loop bound parameter or minimum "
1736 "profitable iterations (whichever is more "
1737 "conservative).\n");
1738 return -1;
1741 return 1;
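/* An illustrative reading of the threshold logic above (the numbers are
   made up): with an assumed VF of 4, the (default) param_min_vect_loop_bound
   of 0 and a cost model answer of min_profitable_iters = 12, the threshold
   th is MAX (0 * 4, 12) = 12, so a loop whose iteration count is known to
   be 10 is rejected as not profitable, while a loop with an unknown count
   is judged by its estimated iteration count against
   min_profitable_estimate.  */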
1744 static opt_result
1745 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1746 vec<data_reference_p> *datarefs,
1747 unsigned int *n_stmts)
1749 *n_stmts = 0;
1750 for (unsigned i = 0; i < loop->num_nodes; i++)
1751 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1752 !gsi_end_p (gsi); gsi_next (&gsi))
1754 gimple *stmt = gsi_stmt (gsi);
1755 if (is_gimple_debug (stmt))
1756 continue;
1757 ++(*n_stmts);
1758 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1759 if (!res)
1761 if (is_gimple_call (stmt) && loop->safelen)
1763 tree fndecl = gimple_call_fndecl (stmt), op;
1764 if (fndecl != NULL_TREE)
1766 cgraph_node *node = cgraph_node::get (fndecl);
1767 if (node != NULL && node->simd_clones != NULL)
1769 unsigned int j, n = gimple_call_num_args (stmt);
1770 for (j = 0; j < n; j++)
1772 op = gimple_call_arg (stmt, j);
1773 if (DECL_P (op)
1774 || (REFERENCE_CLASS_P (op)
1775 && get_base_address (op)))
1776 break;
1778 op = gimple_call_lhs (stmt);
1779 /* Ignore #pragma omp declare simd functions
1780 if they don't have data references in the
1781 call stmt itself. */
1782 if (j == n
1783 && !(op
1784 && (DECL_P (op)
1785 || (REFERENCE_CLASS_P (op)
1786 && get_base_address (op)))))
1787 continue;
1791 return res;
1793 /* If dependence analysis will give up due to the limit on the
1794 number of datarefs stop here and fail fatally. */
1795 if (datarefs->length ()
1796 > (unsigned)param_loop_max_datarefs_for_datadeps)
1797 return opt_result::failure_at (stmt, "exceeded param "
1798 "loop-max-datarefs-for-datadeps\n");
1800 return opt_result::success ();
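/* Illustrative sketch of the simd-clone special case above (the pragma,
   function and loop below are hypothetical, not from this file): given

     #pragma omp declare simd
     extern int f (int);

     #pragma omp simd
     for (int i = 0; i < n; i++)
       a[i] = f (b[i]);

   dataref analysis fails on the call to f, but because the loop carries a
   safelen and f has SIMD clones while neither the call's arguments nor its
   lhs are memory references, the failure is ignored and only the accesses
   to a[] and b[] are recorded as data references.  */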
1803 /* Look for SLP-only access groups and turn each individual access into its own
1804 group. */
1805 static void
1806 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1808 unsigned int i;
1809 struct data_reference *dr;
1811 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1813 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1814 FOR_EACH_VEC_ELT (datarefs, i, dr)
1816 gcc_assert (DR_REF (dr));
1817 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1819 /* Check if the access is part of an interleaving chain. */
1820 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1823 unsigned int group_size = DR_GROUP_SIZE (first_element);
1825 /* Check for an SLP-only group. */
1826 if (!STMT_SLP_TYPE (stmt_info)
1827 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1829 /* Dissolve the group. */
1830 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1832 stmt_vec_info vinfo = first_element;
1833 while (vinfo)
1835 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1836 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1837 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1838 DR_GROUP_SIZE (vinfo) = 1;
1839 if (STMT_VINFO_STRIDED_P (first_element))
1840 DR_GROUP_GAP (vinfo) = 0;
1841 else
1842 DR_GROUP_GAP (vinfo) = group_size - 1;
1843 vinfo = next;
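/* Illustrative sketch of the dissolution above (hypothetical group): a
   two-element interleaved load group for

     x = a[2*i];
     y = a[2*i+1];

   that was marked SLP-only but is not being SLP-vectorized is split into
   two singleton groups: each statement becomes its own
   DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1, and since the access is not
   strided each gets DR_GROUP_GAP 1 (group_size - 1) to account for the
   element skipped between consecutive iterations.  */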
1851 /* Decides whether we need to create an epilogue loop to handle
1852 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1854 void
1855 determine_peel_for_niter (loop_vec_info loop_vinfo)
1857 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1859 unsigned HOST_WIDE_INT const_vf;
1860 HOST_WIDE_INT max_niter
1861 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1863 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1864 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1865 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1866 (loop_vinfo));
1868 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1869 /* The main loop handles all iterations. */
1870 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1871 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1872 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1874 /* Work out the (constant) number of iterations that need to be
1875 peeled for reasons other than niters. */
1876 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1877 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1878 peel_niter += 1;
1879 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1880 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1881 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1883 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1884 /* ??? When peeling for gaps but not alignment, we could
1885 try to check whether the (variable) niters is known to be
1886 VF * N + 1. That's something of a niche case though. */
1887 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1888 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1889 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1890 < (unsigned) exact_log2 (const_vf))
1891 /* In case of versioning, check if the maximum number of
1892 iterations is greater than th. If they are identical,
1893 the epilogue is unnecessary. */
1894 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895 || ((unsigned HOST_WIDE_INT) max_niter
1896 > (th / const_vf) * const_vf))))
1897 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
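/* Worked example with hypothetical numbers: for known niters = 100,
   VF = 8, no peeling for alignment and peeling for gaps requested,
   peel_niter = 1 and (100 - 1) % 8 != 0, so PEELING_FOR_NITER is set and
   an epilogue loop will handle the remainder.  With niters = 97 instead,
   (97 - 1) % 8 == 0 and no epilogue is needed for the iteration count.  */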
1901 /* Function vect_analyze_loop_2.
1903 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1904 for it. The different analyses will record information in the
1905 loop_vec_info struct. */
1906 static opt_result
1907 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1909 opt_result ok = opt_result::success ();
1910 int res;
1911 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1912 poly_uint64 min_vf = 2;
1913 loop_vec_info orig_loop_vinfo = NULL;
1915 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1916 loop_vec_info of the first vectorized loop. */
1917 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1918 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1919 else
1920 orig_loop_vinfo = loop_vinfo;
1921 gcc_assert (orig_loop_vinfo);
1923 /* The first group of checks is independent of the vector size. */
1924 fatal = true;
1926 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1927 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1928 return opt_result::failure_at (vect_location,
1929 "not vectorized: simd if(0)\n");
1931 /* Find all data references in the loop (which correspond to vdefs/vuses)
1932 and analyze their evolution in the loop. */
1934 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1936 /* Gather the data references and count stmts in the loop. */
1937 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1939 opt_result res
1940 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1941 &LOOP_VINFO_DATAREFS (loop_vinfo),
1942 n_stmts);
1943 if (!res)
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1947 "not vectorized: loop contains function "
1948 "calls or data references that cannot "
1949 "be analyzed\n");
1950 return res;
1952 loop_vinfo->shared->save_datarefs ();
1954 else
1955 loop_vinfo->shared->check_datarefs ();
1957 /* Analyze the data references and also adjust the minimal
1958 vectorization factor according to the loads and stores. */
1960 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1961 if (!ok)
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965 "bad data references.\n");
1966 return ok;
1969 /* Classify all cross-iteration scalar data-flow cycles.
1970 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1971 vect_analyze_scalar_cycles (loop_vinfo);
1973 vect_pattern_recog (loop_vinfo);
1975 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1977 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1978 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1980 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1981 if (!ok)
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "bad data access.\n");
1986 return ok;
1989 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1991 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1992 if (!ok)
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "unexpected pattern.\n");
1997 return ok;
2000 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2001 fatal = false;
2003 /* Analyze data dependences between the data-refs in the loop
2004 and adjust the maximum vectorization factor according to
2005 the dependences.
2006 FORNOW: fail at the first data dependence that we encounter. */
2008 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2009 if (!ok)
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "bad data dependence.\n");
2014 return ok;
2016 if (max_vf != MAX_VECTORIZATION_FACTOR
2017 && maybe_lt (max_vf, min_vf))
2018 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2019 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2021 ok = vect_determine_vectorization_factor (loop_vinfo);
2022 if (!ok)
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "can't determine vectorization factor.\n");
2027 return ok;
2029 if (max_vf != MAX_VECTORIZATION_FACTOR
2030 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2031 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2033 /* Compute the scalar iteration cost. */
2034 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2036 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2038 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2039 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2040 if (!ok)
2041 return ok;
2043 /* If there are any SLP instances mark them as pure_slp. */
2044 bool slp = vect_make_slp_decision (loop_vinfo);
2045 if (slp)
2047 /* Find stmts that need to be both vectorized and SLPed. */
2048 vect_detect_hybrid_slp (loop_vinfo);
2050 /* Update the vectorization factor based on the SLP decision. */
2051 vect_update_vf_for_slp (loop_vinfo);
2054 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2056 /* We don't expect to have to roll back to anything other than an empty
2057 set of rgroups. */
2058 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2060 /* This is the point where we can re-start analysis with SLP forced off. */
2061 start_over:
2063 /* Now the vectorization factor is final. */
2064 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2065 gcc_assert (known_ne (vectorization_factor, 0U));
2067 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "vectorization_factor = ");
2071 dump_dec (MSG_NOTE, vectorization_factor);
2072 dump_printf (MSG_NOTE, ", niters = %wd\n",
2073 LOOP_VINFO_INT_NITERS (loop_vinfo));
2076 /* Analyze the alignment of the data-refs in the loop.
2077 Fail if a data reference is found that cannot be vectorized. */
2079 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2080 if (!ok)
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084 "bad data alignment.\n");
2085 return ok;
2088 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2089 It is important to call pruning after vect_analyze_data_ref_accesses,
2090 since we use grouping information gathered by interleaving analysis. */
2091 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2092 if (!ok)
2093 return ok;
2095 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2096 vectorization, since we do not want to add extra peeling or
2097 add versioning for alignment. */
2098 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2099 /* This pass will decide on using loop versioning and/or loop peeling in
2100 order to enhance the alignment of data references in the loop. */
2101 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2102 else
2103 ok = vect_verify_datarefs_alignment (loop_vinfo);
2104 if (!ok)
2105 return ok;
2107 if (slp)
2109 /* Analyze operations in the SLP instances. Note this may
2110 remove unsupported SLP instances which makes the above
2111 SLP kind detection invalid. */
2112 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2113 vect_slp_analyze_operations (loop_vinfo);
2114 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2116 ok = opt_result::failure_at (vect_location,
2117 "unsupported SLP instances\n");
2118 goto again;
2122 /* Dissolve SLP-only groups. */
2123 vect_dissolve_slp_only_groups (loop_vinfo);
2125 /* Scan all the remaining operations in the loop that are not subject
2126 to SLP and make sure they are vectorizable. */
2127 ok = vect_analyze_loop_operations (loop_vinfo);
2128 if (!ok)
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132 "bad operation or unsupported loop bound.\n");
2133 return ok;
2136 /* Decide whether to use a fully-masked loop for this vectorization
2137 factor. */
2138 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2139 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2140 && vect_verify_full_masking (loop_vinfo));
2141 if (dump_enabled_p ())
2143 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "using a fully-masked loop.\n");
2146 else
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "not using a fully-masked loop.\n");
2151 /* If epilog loop is required because of data accesses with gaps,
2152 one additional iteration needs to be peeled. Check if there are
2153 enough iterations for vectorization. */
2154 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2155 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2156 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2158 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2159 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2161 if (known_lt (wi::to_widest (scalar_niters), vf))
2162 return opt_result::failure_at (vect_location,
2163 "loop has no enough iterations to"
2164 " support peeling for gaps.\n");
2167 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2168 loop or a loop that has a lower VF than the main loop. */
2169 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2170 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2171 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2172 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2173 return opt_result::failure_at (vect_location,
2174 "Vectorization factor too high for"
2175 " epilogue loop.\n");
2177 /* Check that the costings of the loop make vectorizing worthwhile. */
2178 res = vect_analyze_loop_costing (loop_vinfo);
2179 if (res < 0)
2181 ok = opt_result::failure_at (vect_location,
2182 "Loop costings may not be worthwhile.\n");
2183 goto again;
2185 if (!res)
2186 return opt_result::failure_at (vect_location,
2187 "Loop costings not worthwhile.\n");
2189 determine_peel_for_niter (loop_vinfo);
2190 /* If an epilogue loop is required make sure we can create one. */
2191 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2192 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2194 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2196 if (!vect_can_advance_ivs_p (loop_vinfo)
2197 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2198 single_exit (LOOP_VINFO_LOOP
2199 (loop_vinfo))))
2201 ok = opt_result::failure_at (vect_location,
2202 "not vectorized: can't create required "
2203 "epilog loop\n");
2204 goto again;
2208 /* During peeling, we need to check if number of loop iterations is
2209 enough for both peeled prolog loop and vector loop. This check
2210 can be merged with the threshold check of loop versioning, so
2211 increase threshold for this case if necessary.
2213 If we are analyzing an epilogue we still want to check what its
2214 versioning threshold would be. If we decide to vectorize the epilogues we
2215 will want to use the lowest versioning threshold of all epilogues and main
2216 loop. This will enable us to enter a vectorized epilogue even when
2217 versioning the loop. We can't simply check whether the epilogue requires
2218 versioning though since we may have skipped some versioning checks when
2219 analyzing the epilogue. For instance, checks for alias versioning will be
2220 skipped when dealing with epilogues as we assume we already checked them
2221 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2222 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2224 poly_uint64 niters_th = 0;
2225 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2227 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2229 /* Niters for peeled prolog loop. */
2230 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2232 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2233 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2234 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2236 else
2237 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2240 /* Niters for at least one iteration of vectorized loop. */
2241 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2242 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2243 /* One additional iteration because of peeling for gap. */
2244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2245 niters_th += 1;
2247 /* Use the same condition as vect_transform_loop to decide when to use
2248 the cost to determine a versioning threshold. */
2249 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2250 && ordered_p (th, niters_th))
2251 niters_th = ordered_max (poly_uint64 (th), niters_th);
2253 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2256 gcc_assert (known_eq (vectorization_factor,
2257 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2259 /* Ok to vectorize! */
2260 return opt_result::success ();
2262 again:
2263 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2264 gcc_assert (!ok);
2266 /* Try again with SLP forced off but if we didn't do any SLP there is
2267 no point in re-trying. */
2268 if (!slp)
2269 return ok;
2271 /* If there are reduction chains re-trying will fail anyway. */
2272 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2273 return ok;
2275 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2276 via interleaving or lane instructions. */
2277 slp_instance instance;
2278 slp_tree node;
2279 unsigned i, j;
2280 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2282 stmt_vec_info vinfo;
2283 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2284 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2285 continue;
2286 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2287 unsigned int size = DR_GROUP_SIZE (vinfo);
2288 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2289 if (! vect_store_lanes_supported (vectype, size, false)
2290 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2291 && ! vect_grouped_store_supported (vectype, size))
2292 return opt_result::failure_at (vinfo->stmt,
2293 "unsupported grouped store\n");
2294 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2296 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2297 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2298 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2299 size = DR_GROUP_SIZE (vinfo);
2300 vectype = STMT_VINFO_VECTYPE (vinfo);
2301 if (! vect_load_lanes_supported (vectype, size, false)
2302 && ! vect_grouped_load_supported (vectype, single_element_p,
2303 size))
2304 return opt_result::failure_at (vinfo->stmt,
2305 "unsupported grouped load\n");
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_NOTE, vect_location,
2311 "re-trying with SLP disabled\n");
2313 /* Roll back state appropriately. No SLP this time. */
2314 slp = false;
2315 /* Restore the vectorization factor as it was without SLP. */
2316 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2317 /* Free the SLP instances. */
2318 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2319 vect_free_slp_instance (instance, false);
2320 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2321 /* Reset SLP type to loop_vect on all stmts. */
2322 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2324 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2325 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2326 !gsi_end_p (si); gsi_next (&si))
2328 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2329 STMT_SLP_TYPE (stmt_info) = loop_vect;
2330 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2331 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2333 /* vectorizable_reduction adjusts reduction stmt def-types,
2334 restore them to that of the PHI. */
2335 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2336 = STMT_VINFO_DEF_TYPE (stmt_info);
2337 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2338 (STMT_VINFO_REDUC_DEF (stmt_info)))
2339 = STMT_VINFO_DEF_TYPE (stmt_info);
2342 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2343 !gsi_end_p (si); gsi_next (&si))
2345 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2346 STMT_SLP_TYPE (stmt_info) = loop_vect;
2347 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2349 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2350 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2351 STMT_SLP_TYPE (stmt_info) = loop_vect;
2352 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2353 !gsi_end_p (pi); gsi_next (&pi))
2354 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2355 = loop_vect;
2359 /* Free optimized alias test DDRS. */
2360 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2361 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2362 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2363 /* Reset target cost data. */
2364 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2365 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2366 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2367 /* Reset accumulated rgroup information. */
2368 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2369 /* Reset assorted flags. */
2370 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2371 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2372 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2373 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2374 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2376 goto start_over;
2379 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2380 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2381 OLD_LOOP_VINFO is better unless something specifically indicates
2382 otherwise.
2384 Note that this deliberately isn't a partial order. */
2386 static bool
2387 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2388 loop_vec_info old_loop_vinfo)
2390 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2391 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2393 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2394 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2396 /* Always prefer a VF of loop->simdlen over any other VF. */
2397 if (loop->simdlen)
2399 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2400 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2401 if (new_simdlen_p != old_simdlen_p)
2402 return new_simdlen_p;
2405 /* Limit the VFs to what is likely to be the maximum number of iterations,
2406 to handle cases in which at least one loop_vinfo is fully-masked. */
2407 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2408 if (estimated_max_niter != -1)
2410 if (known_le (estimated_max_niter, new_vf))
2411 new_vf = estimated_max_niter;
2412 if (known_le (estimated_max_niter, old_vf))
2413 old_vf = estimated_max_niter;
2416 /* Check whether the (fractional) cost per scalar iteration is lower
2417 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2418 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2419 * poly_widest_int (old_vf));
2420 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2421 * poly_widest_int (new_vf));
2422 if (maybe_lt (rel_old, rel_new))
2424 /* When old_loop_vinfo uses a variable vectorization factor,
2425 we know that it has a lower cost for at least one runtime VF.
2426 However, we don't know how likely that VF is.
2428 One option would be to compare the costs for the estimated VFs.
2429 The problem is that that can put too much pressure on the cost
2430 model. E.g. if the estimated VF is also the lowest possible VF,
2431 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2432 for the estimated VF, we'd then choose new_loop_vinfo even
2433 though (a) new_loop_vinfo might not actually be better than
2434 old_loop_vinfo for that VF and (b) it would be significantly
2435 worse at larger VFs.
2437 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2438 no more expensive than old_loop_vinfo even after doubling the
2439 estimated old_loop_vinfo VF. For all but trivial loops, this
2440 ensures that we only pick new_loop_vinfo if it is significantly
2441 better than old_loop_vinfo at the estimated VF. */
2442 if (rel_new.is_constant ())
2443 return false;
2445 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2446 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2447 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2448 * widest_int (old_estimated_vf));
2449 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2450 * widest_int (new_estimated_vf));
2451 return estimated_rel_new * 2 <= estimated_rel_old;
2453 if (known_lt (rel_new, rel_old))
2454 return true;
2456 /* If there's nothing to choose between the loop bodies, see whether
2457 there's a difference in the prologue and epilogue costs. */
2458 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2459 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2461 return false;
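/* Worked example of the cost comparison above, with hypothetical costs:
   if new_loop_vinfo has vec_inside_cost = 20 at new_vf = 8 and
   old_loop_vinfo has vec_inside_cost = 12 at old_vf = 4, then
   rel_new = 20 * 4 = 80 and rel_old = 12 * 8 = 96, so the new candidate
   is cheaper per scalar iteration and the function returns true.  The
   outside (prologue/epilogue) costs only break ties when the body costs
   compare equal.  */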
2464 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2465 true if we should. */
2467 static bool
2468 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2469 loop_vec_info old_loop_vinfo)
2471 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2472 return false;
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "***** Preferring vector mode %s to vector mode %s\n",
2477 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2478 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2479 return true;
2482 /* Function vect_analyze_loop.
2484 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2485 for it. The different analyses will record information in the
2486 loop_vec_info struct. */
2487 opt_loop_vec_info
2488 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2490 auto_vector_modes vector_modes;
2492 /* Autodetect first vector size we try. */
2493 unsigned int autovec_flags
2494 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2495 loop->simdlen != 0);
2496 unsigned int mode_i = 0;
2498 DUMP_VECT_SCOPE ("analyze_loop_nest");
2500 if (loop_outer (loop)
2501 && loop_vec_info_for_loop (loop_outer (loop))
2502 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2503 return opt_loop_vec_info::failure_at (vect_location,
2504 "outer-loop already vectorized.\n");
2506 if (!find_loop_nest (loop, &shared->loop_nest))
2507 return opt_loop_vec_info::failure_at
2508 (vect_location,
2509 "not vectorized: loop nest containing two or more consecutive inner"
2510 " loops cannot be vectorized\n");
2512 unsigned n_stmts = 0;
2513 machine_mode autodetected_vector_mode = VOIDmode;
2514 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2515 machine_mode next_vector_mode = VOIDmode;
2516 poly_uint64 lowest_th = 0;
2517 unsigned vectorized_loops = 0;
2518 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2519 && !unlimited_cost_model (loop));
2521 bool vect_epilogues = false;
2522 opt_result res = opt_result::success ();
2523 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2524 while (1)
2526 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2527 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2528 if (!loop_vinfo)
2530 if (dump_enabled_p ())
2531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2532 "bad loop form.\n");
2533 gcc_checking_assert (first_loop_vinfo == NULL);
2534 return loop_vinfo;
2536 loop_vinfo->vector_mode = next_vector_mode;
2538 bool fatal = false;
2540 /* When pick_lowest_cost_p is true, we should in principle iterate
2541 over all the loop_vec_infos that LOOP_VINFO could replace and
2542 try to vectorize LOOP_VINFO under the same conditions.
2543 E.g. when trying to replace an epilogue loop, we should vectorize
2544 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2545 to replace the main loop, we should vectorize LOOP_VINFO as a main
2546 loop too.
2548 However, autovectorize_vector_modes is usually sorted as follows:
2550 - Modes that naturally produce lower VFs usually follow modes that
2551 naturally produce higher VFs.
2553 - When modes naturally produce the same VF, maskable modes
2554 usually follow unmaskable ones, so that the maskable mode
2555 can be used to vectorize the epilogue of the unmaskable mode.
2557 This order is preferred because it leads to the maximum
2558 epilogue vectorization opportunities. Targets should only use
2559 a different order if they want to make wide modes available while
2560 disparaging them relative to earlier, smaller modes. The assumption
2561 in that case is that the wider modes are more expensive in some
2562 way that isn't reflected directly in the costs.
2564 There should therefore be few interesting cases in which
2565 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2566 treated as a standalone loop, and ends up being genuinely cheaper
2567 than FIRST_LOOP_VINFO. */
2568 if (vect_epilogues)
2569 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2571 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2572 if (mode_i == 0)
2573 autodetected_vector_mode = loop_vinfo->vector_mode;
2574 if (dump_enabled_p ())
2576 if (res)
2577 dump_printf_loc (MSG_NOTE, vect_location,
2578 "***** Analysis succeeded with vector mode %s\n",
2579 GET_MODE_NAME (loop_vinfo->vector_mode));
2580 else
2581 dump_printf_loc (MSG_NOTE, vect_location,
2582 "***** Analysis failed with vector mode %s\n",
2583 GET_MODE_NAME (loop_vinfo->vector_mode));
2586 loop->aux = NULL;
2588 if (!fatal)
2589 while (mode_i < vector_modes.length ()
2590 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2592 if (dump_enabled_p ())
2593 dump_printf_loc (MSG_NOTE, vect_location,
2594 "***** The result for vector mode %s would"
2595 " be the same\n",
2596 GET_MODE_NAME (vector_modes[mode_i]));
2597 mode_i += 1;
2600 if (res)
2602 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2603 vectorized_loops++;
2605 /* Once we hit the desired simdlen for the first time,
2606 discard any previous attempts. */
2607 if (simdlen
2608 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2610 delete first_loop_vinfo;
2611 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2612 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2613 simdlen = 0;
2615 else if (pick_lowest_cost_p && first_loop_vinfo)
2617 /* Keep trying to roll back vectorization attempts while the
2618 loop_vec_infos they produced were worse than this one. */
2619 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2620 while (!vinfos.is_empty ()
2621 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2623 gcc_assert (vect_epilogues);
2624 delete vinfos.pop ();
2626 if (vinfos.is_empty ()
2627 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2629 delete first_loop_vinfo;
2630 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2631 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2635 if (first_loop_vinfo == NULL)
2637 first_loop_vinfo = loop_vinfo;
2638 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2640 else if (vect_epilogues
2641 /* For now only allow one epilogue loop. */
2642 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2644 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2645 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2646 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2647 || maybe_ne (lowest_th, 0U));
2648 /* Keep track of the known smallest versioning
2649 threshold. */
2650 if (ordered_p (lowest_th, th))
2651 lowest_th = ordered_min (lowest_th, th);
2653 else
2654 delete loop_vinfo;
2656 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2657 enabled, SIMDUID is not set, it is the innermost loop and we have
2658 either already found the loop's SIMDLEN or there was no SIMDLEN to
2659 begin with.
2660 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2661 vect_epilogues = (!simdlen
2662 && loop->inner == NULL
2663 && param_vect_epilogues_nomask
2664 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2665 && !loop->simduid
2666 /* For now only allow one epilogue loop, but allow
2667 pick_lowest_cost_p to replace it. */
2668 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2669 || pick_lowest_cost_p));
2671 /* Commit to first_loop_vinfo if we have no reason to try
2672 alternatives. */
2673 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2674 break;
2676 else
2678 delete loop_vinfo;
2679 if (fatal)
2681 gcc_checking_assert (first_loop_vinfo == NULL);
2682 break;
2686 if (mode_i < vector_modes.length ()
2687 && VECTOR_MODE_P (autodetected_vector_mode)
2688 && (related_vector_mode (vector_modes[mode_i],
2689 GET_MODE_INNER (autodetected_vector_mode))
2690 == autodetected_vector_mode)
2691 && (related_vector_mode (autodetected_vector_mode,
2692 GET_MODE_INNER (vector_modes[mode_i]))
2693 == vector_modes[mode_i]))
2695 if (dump_enabled_p ())
2696 dump_printf_loc (MSG_NOTE, vect_location,
2697 "***** Skipping vector mode %s, which would"
2698 " repeat the analysis for %s\n",
2699 GET_MODE_NAME (vector_modes[mode_i]),
2700 GET_MODE_NAME (autodetected_vector_mode));
2701 mode_i += 1;
2704 if (mode_i == vector_modes.length ()
2705 || autodetected_vector_mode == VOIDmode)
2706 break;
2708 /* Try the next biggest vector size. */
2709 next_vector_mode = vector_modes[mode_i++];
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE, vect_location,
2712 "***** Re-trying analysis with vector mode %s\n",
2713 GET_MODE_NAME (next_vector_mode));
2716 if (first_loop_vinfo)
2718 loop->aux = (loop_vec_info) first_loop_vinfo;
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "***** Choosing vector mode %s\n",
2722 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2723 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2724 return first_loop_vinfo;
2727 return opt_loop_vec_info::propagate_failure (res);
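/* Illustrative walk-through with a hypothetical mode list: if
   autovectorize_vector_modes returned { V4SI, V8HI, V16QI } and analysis
   with the autodetected mode succeeded, that loop_vinfo becomes
   first_loop_vinfo.  The remaining modes are then only tried to honour a
   requested simdlen, to look for a cheaper variant when VECT_COMPARE_COSTS
   is enabled, or to vectorize the epilogue, and modes whose analysis would
   provably repeat an earlier result are skipped.  */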
2730 /* Return true if there is an in-order reduction function for CODE, storing
2731 it in *REDUC_FN if so. */
2733 static bool
2734 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2736 switch (code)
2738 case PLUS_EXPR:
2739 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2740 return true;
2742 default:
2743 return false;
2747 /* Function reduction_fn_for_scalar_code
2749 Input:
2750 CODE - tree_code of a reduction operation.
2752 Output:
2753 REDUC_FN - the corresponding internal function to be used to reduce the
2754 vector of partial results into a single scalar result, or IFN_LAST
2755 if the operation is a supported reduction operation, but does not have
2756 such an internal function.
2758 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2760 static bool
2761 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2763 switch (code)
2765 case MAX_EXPR:
2766 *reduc_fn = IFN_REDUC_MAX;
2767 return true;
2769 case MIN_EXPR:
2770 *reduc_fn = IFN_REDUC_MIN;
2771 return true;
2773 case PLUS_EXPR:
2774 *reduc_fn = IFN_REDUC_PLUS;
2775 return true;
2777 case BIT_AND_EXPR:
2778 *reduc_fn = IFN_REDUC_AND;
2779 return true;
2781 case BIT_IOR_EXPR:
2782 *reduc_fn = IFN_REDUC_IOR;
2783 return true;
2785 case BIT_XOR_EXPR:
2786 *reduc_fn = IFN_REDUC_XOR;
2787 return true;
2789 case MULT_EXPR:
2790 case MINUS_EXPR:
2791 *reduc_fn = IFN_LAST;
2792 return true;
2794 default:
2795 return false;
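/* Illustrative example (hypothetical loop): for

     int m = INT_MIN;
     for (i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   the reduction code is MAX_EXPR and the function above reports
   IFN_REDUC_MAX, so the epilogue can reduce the vector of partial maxima
   with one internal-function call when the target supports it.  For
   MULT_EXPR it still returns true but with IFN_LAST, meaning the final
   reduction has to be open-coded.  */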
2799 /* If there is a neutral value X such that SLP reduction NODE would not
2800 be affected by the introduction of additional X elements, return that X,
2801 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2802 is the vector type that would hold element X. REDUC_CHAIN is true if
2803 the SLP statements perform a single reduction, false if each statement
2804 performs an independent reduction. */
2806 static tree
2807 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2808 tree_code code, bool reduc_chain)
2810 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2811 stmt_vec_info stmt_vinfo = stmts[0];
2812 tree scalar_type = TREE_TYPE (vector_type);
2813 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2814 gcc_assert (loop);
2816 switch (code)
2818 case WIDEN_SUM_EXPR:
2819 case DOT_PROD_EXPR:
2820 case SAD_EXPR:
2821 case PLUS_EXPR:
2822 case MINUS_EXPR:
2823 case BIT_IOR_EXPR:
2824 case BIT_XOR_EXPR:
2825 return build_zero_cst (scalar_type);
2827 case MULT_EXPR:
2828 return build_one_cst (scalar_type);
2830 case BIT_AND_EXPR:
2831 return build_all_ones_cst (scalar_type);
2833 case MAX_EXPR:
2834 case MIN_EXPR:
2835 /* For MIN/MAX the initial values are neutral. A reduction chain
2836 has only a single initial value, so that value is neutral for
2837 all statements. */
2838 if (reduc_chain)
2839 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2840 loop_preheader_edge (loop));
2841 return NULL_TREE;
2843 default:
2844 return NULL_TREE;
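/* Illustrative example (hypothetical values): padding a PLUS_EXPR
   reduction of { a, b, c } with the neutral element 0 to fill a
   four-element vector computes a + b + c + 0, which leaves the result
   unchanged; likewise 1 for MULT_EXPR and all-ones for BIT_AND_EXPR.  No
   such constant exists for MIN/MAX in general, which is why those cases
   fall back to the single initial value of a reduction chain or to
   NULL_TREE.  */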
2848 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2849 STMT is printed with a message MSG. */
2851 static void
2852 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2854 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2857 /* Return true if we need an in-order reduction for operation CODE
2858 on type TYPE, i.e. if reassociating the operation could change the
2859 result or trap (non-associative FP, trapping overflow, saturation). */
2861 bool
2862 needs_fold_left_reduction_p (tree type, tree_code code)
2864 /* CHECKME: check for !flag_finite_math_only too? */
2865 if (SCALAR_FLOAT_TYPE_P (type))
2866 switch (code)
2868 case MIN_EXPR:
2869 case MAX_EXPR:
2870 return false;
2872 default:
2873 return !flag_associative_math;
2876 if (INTEGRAL_TYPE_P (type))
2878 if (!operation_no_trapping_overflow (type, code))
2879 return true;
2880 return false;
2883 if (SAT_FIXED_POINT_TYPE_P (type))
2884 return true;
2886 return false;
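/* Illustrative example of why the floating-point case needs an in-order
   reduction unless -fassociative-math is in effect (hypothetical values):
   in single precision (1e20f + -1e20f) + 1.0f == 1.0f whereas
   1e20f + (-1e20f + 1.0f) == 0.0f, so distributing the additions across
   vector lanes and reducing at the end can change the result.  MIN and
   MAX are insensitive to the evaluation order and are excluded above.  */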
2889 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2890 has a handled computation expression. Store the main reduction
2891 operation in *CODE. */
2893 static bool
2894 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2895 tree loop_arg, enum tree_code *code,
2896 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2898 auto_bitmap visited;
2899 tree lookfor = PHI_RESULT (phi);
2900 ssa_op_iter curri;
2901 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2902 while (USE_FROM_PTR (curr) != loop_arg)
2903 curr = op_iter_next_use (&curri);
2904 curri.i = curri.numops;
2907 path.safe_push (std::make_pair (curri, curr));
2908 tree use = USE_FROM_PTR (curr);
2909 if (use == lookfor)
2910 break;
2911 gimple *def = SSA_NAME_DEF_STMT (use);
2912 if (gimple_nop_p (def)
2913 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2915 pop:
2918 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2919 curri = x.first;
2920 curr = x.second;
2922 curr = op_iter_next_use (&curri);
2923 /* Skip already visited or non-SSA operands (from iterating
2924 over PHI args). */
2925 while (curr != NULL_USE_OPERAND_P
2926 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2927 || ! bitmap_set_bit (visited,
2928 SSA_NAME_VERSION
2929 (USE_FROM_PTR (curr)))));
2931 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2932 if (curr == NULL_USE_OPERAND_P)
2933 break;
2935 else
2937 if (gimple_code (def) == GIMPLE_PHI)
2938 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2939 else
2940 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2941 while (curr != NULL_USE_OPERAND_P
2942 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2943 || ! bitmap_set_bit (visited,
2944 SSA_NAME_VERSION
2945 (USE_FROM_PTR (curr)))))
2946 curr = op_iter_next_use (&curri);
2947 if (curr == NULL_USE_OPERAND_P)
2948 goto pop;
2951 while (1);
2952 if (dump_file && (dump_flags & TDF_DETAILS))
2954 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2955 unsigned i;
2956 std::pair<ssa_op_iter, use_operand_p> *x;
2957 FOR_EACH_VEC_ELT (path, i, x)
2958 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2959 dump_printf (MSG_NOTE, "\n");
2962 /* Check whether the reduction path detected is valid. */
2963 bool fail = path.length () == 0;
2964 bool neg = false;
2965 int sign = -1;
2966 *code = ERROR_MARK;
2967 for (unsigned i = 1; i < path.length (); ++i)
2969 gimple *use_stmt = USE_STMT (path[i].second);
2970 tree op = USE_FROM_PTR (path[i].second);
2971 if (! is_gimple_assign (use_stmt)
2972 /* The following makes sure we can compute the operand index
2973 easily, plus it mostly disallows chaining via COND_EXPR condition
2974 operands. */
2975 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2976 && (gimple_num_ops (use_stmt) <= 2
2977 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2978 && (gimple_num_ops (use_stmt) <= 3
2979 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2981 fail = true;
2982 break;
2984 /* Check that the op is used by only a single stmt inside
2985 the loop. */
2986 imm_use_iterator imm_iter;
2987 gimple *op_use_stmt;
2988 unsigned cnt = 0;
2989 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2990 if (!is_gimple_debug (op_use_stmt)
2991 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2993 /* We want to allow x + x but not x < 1 ? x : 2. */
2994 if (is_gimple_assign (op_use_stmt)
2995 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2997 use_operand_p use_p;
2998 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2999 cnt++;
3001 else
3002 cnt++;
3004 if (cnt != 1)
3006 fail = true;
3007 break;
3009 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3010 if (use_code == MINUS_EXPR)
3012 use_code = PLUS_EXPR;
3013 /* Track whether we negate the reduction value each iteration. */
3014 if (gimple_assign_rhs2 (use_stmt) == op)
3015 neg = ! neg;
3017 if (CONVERT_EXPR_CODE_P (use_code)
3018 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3019 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3021 else if (*code == ERROR_MARK)
3023 *code = use_code;
3024 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3026 else if (use_code != *code)
3028 fail = true;
3029 break;
3031 else if ((use_code == MIN_EXPR
3032 || use_code == MAX_EXPR)
3033 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3035 fail = true;
3036 break;
3039 return ! fail && ! neg && *code != ERROR_MARK;
3042 bool
3043 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3044 tree loop_arg, enum tree_code code)
3046 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3047 enum tree_code code_;
3048 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3049 && code_ == code);
3054 /* Function vect_is_simple_reduction
3056 (1) Detect a cross-iteration def-use cycle that represents a simple
3057 reduction computation. We look for the following pattern:
3059 loop_header:
3060 a1 = phi < a0, a2 >
3061 a3 = ...
3062 a2 = operation (a3, a1)
3064 or
3066 a3 = ...
3067 loop_header:
3068 a1 = phi < a0, a2 >
3069 a2 = operation (a3, a1)
3071 such that:
3072 1. operation is commutative and associative and it is safe to
3073 change the order of the computation
3074 2. no uses for a2 in the loop (a2 is used out of the loop)
3075 3. no uses of a1 in the loop besides the reduction operation
3076 4. no uses of a1 outside the loop.
3078 Conditions 1,4 are tested here.
3079 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3081 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3082 nested cycles.
3084 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3085 reductions:
3087 a1 = phi < a0, a2 >
3088 inner loop (def of a3)
3089 a2 = phi < a3 >
3091 (4) Detect condition expressions, ie:
3092 for (int i = 0; i < N; i++)
3093 if (a[i] < val)
3094 ret_val = a[i];
3098 static stmt_vec_info
3099 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3100 bool *double_reduc, bool *reduc_chain_p)
3102 gphi *phi = as_a <gphi *> (phi_info->stmt);
3103 gimple *phi_use_stmt = NULL;
3104 imm_use_iterator imm_iter;
3105 use_operand_p use_p;
3107 *double_reduc = false;
3108 *reduc_chain_p = false;
3109 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3111 tree phi_name = PHI_RESULT (phi);
3112 /* ??? If there are no uses of the PHI result the inner loop reduction
3113 won't be detected as possibly double-reduction by vectorizable_reduction
3114 because that tries to walk the PHI arg from the preheader edge which
3115 can be constant. See PR60382. */
3116 if (has_zero_uses (phi_name))
3117 return NULL;
3118 class loop *loop = (gimple_bb (phi))->loop_father;
3119 unsigned nphi_def_loop_uses = 0;
3120 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3122 gimple *use_stmt = USE_STMT (use_p);
3123 if (is_gimple_debug (use_stmt))
3124 continue;
3126 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3128 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "intermediate value used outside loop.\n");
3132 return NULL;
3135 nphi_def_loop_uses++;
3136 phi_use_stmt = use_stmt;
3139 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3140 if (TREE_CODE (latch_def) != SSA_NAME)
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144 "reduction: not ssa_name: %T\n", latch_def);
3145 return NULL;
3148 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3149 if (!def_stmt_info
3150 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3151 return NULL;
3153 bool nested_in_vect_loop
3154 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3155 unsigned nlatch_def_loop_uses = 0;
3156 auto_vec<gphi *, 3> lcphis;
3157 bool inner_loop_of_double_reduc = false;
3158 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3160 gimple *use_stmt = USE_STMT (use_p);
3161 if (is_gimple_debug (use_stmt))
3162 continue;
3163 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3164 nlatch_def_loop_uses++;
3165 else
3167 /* We can have more than one loop-closed PHI. */
3168 lcphis.safe_push (as_a <gphi *> (use_stmt));
3169 if (nested_in_vect_loop
3170 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3171 == vect_double_reduction_def))
3172 inner_loop_of_double_reduc = true;
3176 /* If we are vectorizing an inner reduction, it is executed in the
3177 original order only when we are not dealing with a double
3178 reduction. */
3179 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3181 if (dump_enabled_p ())
3182 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3183 "detected nested cycle: ");
3184 return def_stmt_info;
3187 /* If this isn't a nested cycle or if the nested cycle reduction value
3188 is used outside of the inner loop we cannot handle uses of the reduction
3189 value. */
3190 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3192 if (dump_enabled_p ())
3193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3194 "reduction used in loop.\n");
3195 return NULL;
3198 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3199 defined in the inner loop. */
3200 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3202 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3203 if (gimple_phi_num_args (def_stmt) != 1
3204 || TREE_CODE (op1) != SSA_NAME)
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3208 "unsupported phi node definition.\n");
3210 return NULL;
3213 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3214 if (gimple_bb (def1)
3215 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3216 && loop->inner
3217 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3218 && is_gimple_assign (def1)
3219 && is_a <gphi *> (phi_use_stmt)
3220 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3222 if (dump_enabled_p ())
3223 report_vect_op (MSG_NOTE, def_stmt,
3224 "detected double reduction: ");
3226 *double_reduc = true;
3227 return def_stmt_info;
3230 return NULL;
3233 /* Look for the expression computing latch_def from the loop PHI result. */
3234 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3235 enum tree_code code;
3236 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3237 path))
3239 STMT_VINFO_REDUC_CODE (phi_info) = code;
3240 if (code == COND_EXPR && !nested_in_vect_loop)
3241 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3243 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3244 reduction chain for which the additional restriction is that
3245 all operations in the chain are the same. */
3246 auto_vec<stmt_vec_info, 8> reduc_chain;
3247 unsigned i;
3248 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3249 for (i = path.length () - 1; i >= 1; --i)
3251 gimple *stmt = USE_STMT (path[i].second);
3252 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3253 STMT_VINFO_REDUC_IDX (stmt_info)
3254 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3255 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3256 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3257 && (i == 1 || i == path.length () - 1));
3258 if ((stmt_code != code && !leading_conversion)
3259 /* We can only handle the final value in epilogue
3260 generation for reduction chains. */
3261 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3262 is_slp_reduc = false;
3263 /* For reduction chains we support trailing/leading
3264 conversions. We do not store those in the actual chain. */
3265 if (leading_conversion)
3266 continue;
3267 reduc_chain.safe_push (stmt_info);
3269 if (is_slp_reduc && reduc_chain.length () > 1)
3271 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3273 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3274 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3276 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3277 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3279 /* Save the chain for further analysis in SLP detection. */
3280 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3281 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3283 *reduc_chain_p = true;
3284 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 "reduction: detected reduction chain\n");
3288 else if (dump_enabled_p ())
3289 dump_printf_loc (MSG_NOTE, vect_location,
3290 "reduction: detected reduction\n");
3292 return def_stmt_info;
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "reduction: unknown pattern\n");
3299 return NULL;
3302 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3303 int
3304 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3305 int *peel_iters_epilogue,
3306 stmt_vector_for_cost *scalar_cost_vec,
3307 stmt_vector_for_cost *prologue_cost_vec,
3308 stmt_vector_for_cost *epilogue_cost_vec)
3310 int retval = 0;
3311 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3313 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3315 *peel_iters_epilogue = assumed_vf / 2;
3316 if (dump_enabled_p ())
3317 dump_printf_loc (MSG_NOTE, vect_location,
3318 "cost model: epilogue peel iters set to vf/2 "
3319 "because loop iterations are unknown .\n");
3321 /* If peeled iterations are known but the number of scalar loop
3322 iterations is unknown, count a taken branch per peeled loop. */
3323 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3324 NULL, 0, vect_prologue);
3325 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3326 NULL, 0, vect_epilogue);
3328 else
3330 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3331 peel_iters_prologue = niters < peel_iters_prologue ?
3332 niters : peel_iters_prologue;
3333 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3334 /* If we need to peel for gaps but the computed epilogue peel count
3335 is zero, we still have to peel VF iterations. */
3336 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3337 *peel_iters_epilogue = assumed_vf;
3340 stmt_info_for_cost *si;
3341 int j;
3342 if (peel_iters_prologue)
3343 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3344 retval += record_stmt_cost (prologue_cost_vec,
3345 si->count * peel_iters_prologue,
3346 si->kind, si->stmt_info, si->misalign,
3347 vect_prologue);
3348 if (*peel_iters_epilogue)
3349 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3350 retval += record_stmt_cost (epilogue_cost_vec,
3351 si->count * *peel_iters_epilogue,
3352 si->kind, si->stmt_info, si->misalign,
3353 vect_epilogue);
3355 return retval;
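/* Worked example with hypothetical numbers: with known niters = 100,
   assumed_vf = 4 and peel_iters_prologue = 3 for alignment, the epilogue
   peels (100 - 3) % 4 = 1 iteration, and the scalar single-iteration cost
   vector is charged three times to the prologue and once to the epilogue.
   Had the remainder been 0 while peeling for gaps was required, a full
   assumed_vf of 4 iterations would have been charged to the epilogue.  */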
3358 /* Function vect_estimate_min_profitable_iters
3360 Return the number of iterations required for the vector version of the
3361 loop to be profitable relative to the cost of the scalar version of the
3362 loop.
3364 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3365 of iterations for vectorization. -1 value means loop vectorization
3366 is not profitable. This returned value may be used for dynamic
3367 profitability check.
3369 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3370 for static check against estimated number of iterations. */
3372 static void
3373 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3374 int *ret_min_profitable_niters,
3375 int *ret_min_profitable_estimate)
3377 int min_profitable_iters;
3378 int min_profitable_estimate;
3379 int peel_iters_prologue;
3380 int peel_iters_epilogue;
3381 unsigned vec_inside_cost = 0;
3382 int vec_outside_cost = 0;
3383 unsigned vec_prologue_cost = 0;
3384 unsigned vec_epilogue_cost = 0;
3385 int scalar_single_iter_cost = 0;
3386 int scalar_outside_cost = 0;
3387 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3388 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3389 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3391 /* Cost model disabled. */
3392 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3394 if (dump_enabled_p ())
3395 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3396 *ret_min_profitable_niters = 0;
3397 *ret_min_profitable_estimate = 0;
3398 return;
3401 /* Requires loop versioning tests to handle misalignment. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3406 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3407 NULL, 0, vect_prologue);
3408 if (dump_enabled_p ())
3409 dump_printf (MSG_NOTE,
3410 "cost model: Adding cost of checks for loop "
3411 "versioning to treat misalignment.\n");
3414 /* Requires loop versioning with alias checks. */
3415 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3417 /* FIXME: Make cost depend on complexity of individual check. */
3418 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3419 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3420 NULL, 0, vect_prologue);
3421 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3422 if (len)
3423 /* Count LEN - 1 ANDs and LEN comparisons. */
3424 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3425 scalar_stmt, NULL, 0, vect_prologue);
3426 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3427 if (len)
3429 /* Count LEN - 1 ANDs and LEN comparisons. */
3430 unsigned int nstmts = len * 2 - 1;
3431 /* +1 for each bias that needs adding. */
3432 for (unsigned int i = 0; i < len; ++i)
3433 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3434 nstmts += 1;
3435 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3436 scalar_stmt, NULL, 0, vect_prologue);
3438 if (dump_enabled_p ())
3439 dump_printf (MSG_NOTE,
3440 "cost model: Adding cost of checks for loop "
3441 "versioning aliasing.\n");
3444 /* Requires loop versioning with niter checks. */
3445 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3447 /* FIXME: Make cost depend on complexity of individual check. */
3448 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3449 NULL, 0, vect_prologue);
3450 if (dump_enabled_p ())
3451 dump_printf (MSG_NOTE,
3452 "cost model: Adding cost of checks for loop "
3453 "versioning niters.\n");
3456 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3457 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_prologue);
3460 /* Count statements in scalar loop. Using this as scalar cost for a single
3461 iteration for now.
3463 TODO: Add outer loop support.
3465 TODO: Consider assigning different costs to different scalar
3466 statements. */
3468 scalar_single_iter_cost
3469 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3471 /* Add additional cost for the peeled instructions in prologue and epilogue
3472 loop. (For fully-masked loops there will be no peeling.)
3474 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3475 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3477 TODO: Build an expression that represents peel_iters for prologue and
3478 epilogue to be used in a run-time test. */
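/* For illustration only, with made-up numbers: with an assumed VF of 8 and
unknown peel counts (the npeel < 0 case below), we charge 4 prologue and
4 epilogue scalar iterations plus the branch guards costed below. */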
3480 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3482 peel_iters_prologue = 0;
3483 peel_iters_epilogue = 0;
3485 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3487 /* We need to peel exactly one iteration. */
3488 peel_iters_epilogue += 1;
3489 stmt_info_for_cost *si;
3490 int j;
3491 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3492 j, si)
3493 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
3494 si->kind, si->stmt_info, si->misalign,
3495 vect_epilogue);
3498 /* Calculate how many masks we need to generate. */
3499 unsigned int num_masks = 0;
3500 rgroup_masks *rgm;
3501 unsigned int num_vectors_m1;
3502 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3503 if (rgm->mask_type)
3504 num_masks += num_vectors_m1 + 1;
3505 gcc_assert (num_masks > 0);
3507 /* In the worst case, we need to generate each mask in the prologue
3508 and in the loop body. One of the loop body mask instructions
3509 replaces the comparison in the scalar loop, and since we don't
3510 count the scalar comparison against the scalar body, we shouldn't
3511 count that vector instruction against the vector body either.
3513 Sometimes we can use unpacks instead of generating prologue
3514 masks and sometimes the prologue mask will fold to a constant,
3515 so the actual prologue cost might be smaller. However, it's
3516 simpler and safer to use the worst-case cost; if this ends up
3517 being the tie-breaker between vectorizing or not, then it's
3518 probably better not to vectorize. */
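/* For instance, two mask rgroups needing one and two vectors respectively
give num_masks = 1 + 2 = 3, so the calls below charge three mask
computations to the prologue and two to the loop body. */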
3519 (void) add_stmt_cost (loop_vinfo,
3520 target_cost_data, num_masks, vector_stmt,
3521 NULL, 0, vect_prologue);
3522 (void) add_stmt_cost (loop_vinfo,
3523 target_cost_data, num_masks - 1, vector_stmt,
3524 NULL, 0, vect_body);
3526 else if (npeel < 0)
3528 peel_iters_prologue = assumed_vf / 2;
3529 if (dump_enabled_p ())
3530 dump_printf (MSG_NOTE, "cost model: "
3531 "prologue peel iters set to vf/2.\n");
3533 /* If peeling for alignment is unknown, the loop bound of the main loop
3534 becomes unknown. */
3535 peel_iters_epilogue = assumed_vf / 2;
3536 if (dump_enabled_p ())
3537 dump_printf (MSG_NOTE, "cost model: "
3538 "epilogue peel iters set to vf/2 because "
3539 "peeling for alignment is unknown.\n");
3541 /* If peeled iterations are unknown, count a taken branch and a not taken
3542 branch per peeled loop. Even if scalar loop iterations are known,
3543 vector iterations are not known since peeled prologue iterations are
3544 not known. Hence guards remain the same. */
3545 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3546 NULL, 0, vect_prologue);
3547 (void) add_stmt_cost (loop_vinfo,
3548 target_cost_data, 1, cond_branch_not_taken,
3549 NULL, 0, vect_prologue);
3550 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3551 NULL, 0, vect_epilogue);
3552 (void) add_stmt_cost (loop_vinfo,
3553 target_cost_data, 1, cond_branch_not_taken,
3554 NULL, 0, vect_epilogue);
3555 stmt_info_for_cost *si;
3556 int j;
3557 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3559 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3560 si->count * peel_iters_prologue,
3561 si->kind, si->stmt_info, si->misalign,
3562 vect_prologue);
3563 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3564 si->count * peel_iters_epilogue,
3565 si->kind, si->stmt_info, si->misalign,
3566 vect_epilogue);
3569 else
3571 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3572 stmt_info_for_cost *si;
3573 int j;
3574 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3576 prologue_cost_vec.create (2);
3577 epilogue_cost_vec.create (2);
3578 peel_iters_prologue = npeel;
3580 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3581 &peel_iters_epilogue,
3582 &LOOP_VINFO_SCALAR_ITERATION_COST
3583 (loop_vinfo),
3584 &prologue_cost_vec,
3585 &epilogue_cost_vec);
3587 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3588 (void) add_stmt_cost (loop_vinfo,
3589 data, si->count, si->kind, si->stmt_info,
3590 si->misalign, vect_prologue);
3592 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3593 (void) add_stmt_cost (loop_vinfo,
3594 data, si->count, si->kind, si->stmt_info,
3595 si->misalign, vect_epilogue);
3597 prologue_cost_vec.release ();
3598 epilogue_cost_vec.release ();
3601 /* FORNOW: The scalar outside cost is incremented in one of the
3602 following ways:
3604 1. The vectorizer checks for alignment and aliasing and generates
3605 a condition that allows dynamic vectorization. A cost model
3606 check is ANDED with the versioning condition. Hence scalar code
3607 path now has the added cost of the versioning check.
3609 if (cost > th & versioning_check)
3610 jmp to vector code
3612 Hence run-time scalar is incremented by not-taken branch cost.
3614 2. The vectorizer then checks if a prologue is required. If the
3615 cost model check was not done before during versioning, it has to
3616 be done before the prologue check.
3618 if (cost <= th)
3619 prologue = scalar_iters
3620 if (prologue == 0)
3621 jmp to vector code
3622 else
3623 execute prologue
3624 if (prologue == num_iters)
3625 go to exit
3627 Hence the run-time scalar cost is incremented by a taken branch,
3628 plus a not-taken branch, plus a taken branch cost.
3630 3. The vectorizer then checks if an epilogue is required. If the
3631 cost model check was not done before during prologue check, it
3632 has to be done with the epilogue check.
3634 if (prologue == 0)
3635 jmp to vector code
3636 else
3637 execute prologue
3638 if (prologue == num_iters)
3639 go to exit
3640 vector code:
3641 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3642 jmp to epilogue
3644 Hence the run-time scalar cost should be incremented by 2 taken
3645 branches.
3647 TODO: The back end may reorder the BBs differently and reverse
3648 conditions/branch directions. Change the estimates below to
3649 something more reasonable. */
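/* As a purely illustrative example, if a taken branch costs 3 units and a
not-taken branch costs 1, the versioning case below adds 1, the
prologue-check case adds 2 * 3 + 1 = 7, and the epilogue-check case adds
2 * 3 = 6 to scalar_outside_cost. */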
3651 /* If the number of iterations is known and we do not do versioning, we can
3652 decide whether to vectorize at compile time. Hence the scalar version
3653 does not carry cost model guard costs. */
3654 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3655 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3657 /* Cost model check occurs at versioning. */
3658 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3659 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3660 else
3662 /* Cost model check occurs at prologue generation. */
3663 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3664 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3665 + vect_get_stmt_cost (cond_branch_not_taken);
3666 /* Cost model check occurs at epilogue generation. */
3667 else
3668 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3672 /* Complete the target-specific cost calculations. */
3673 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3674 &vec_inside_cost, &vec_epilogue_cost);
3676 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3678 /* Stash the costs so that we can compare two loop_vec_infos. */
3679 loop_vinfo->vec_inside_cost = vec_inside_cost;
3680 loop_vinfo->vec_outside_cost = vec_outside_cost;
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3685 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3686 vec_inside_cost);
3687 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3688 vec_prologue_cost);
3689 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3690 vec_epilogue_cost);
3691 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3692 scalar_single_iter_cost);
3693 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3694 scalar_outside_cost);
3695 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3696 vec_outside_cost);
3697 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3698 peel_iters_prologue);
3699 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3700 peel_iters_epilogue);
3703 /* Calculate number of iterations required to make the vector version
3704 profitable, relative to the loop bodies only. The following condition
3705 must hold true:
3706 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3707 where
3708 SIC = scalar iteration cost, VIC = vector iteration cost,
3709 VOC = vector outside cost, VF = vectorization factor,
3710 NPEEL = prologue iterations + epilogue iterations,
3711 SOC = scalar outside cost for run time cost model check. */
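/* Worked example with made-up costs: SIC = 4, VIC = 6, VF = 4, NPEEL = 0,
VOC = 20, SOC = 0. Each vector iteration then saves 4 * 4 - 6 = 10 units,
and both variants cost 32 units at niters = 8, so the break-even point is
just above 8 scalar iterations. */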
3713 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3714 - vec_inside_cost);
3715 if (saving_per_viter <= 0)
3717 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3718 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3719 "vectorization did not happen for a simd loop");
3721 if (dump_enabled_p ())
3722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3723 "cost model: the vector iteration cost = %d "
3724 "divided by the scalar iteration cost = %d "
3725 "is greater or equal to the vectorization factor = %d"
3726 ".\n",
3727 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3728 *ret_min_profitable_niters = -1;
3729 *ret_min_profitable_estimate = -1;
3730 return;
3733 /* ??? The "if" arm is written to handle all cases; see below for what
3734 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3735 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3737 /* Rewriting the condition above in terms of the number of
3738 vector iterations (vniters) rather than the number of
3739 scalar iterations (niters) gives:
3741 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3743 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3745 For integer N, X and Y when X > 0:
3747 N * X > Y <==> N >= (Y /[floor] X) + 1. */
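/* E.g. Y = 17 and X = 5 give N >= 17 / 5 + 1 = 4; indeed 4 * 5 = 20 > 17,
while 3 * 5 = 15 is not. */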
3748 int outside_overhead = (vec_outside_cost
3749 - scalar_single_iter_cost * peel_iters_prologue
3750 - scalar_single_iter_cost * peel_iters_epilogue
3751 - scalar_outside_cost);
3752 /* We're only interested in cases that require at least one
3753 vector iteration. */
3754 int min_vec_niters = 1;
3755 if (outside_overhead > 0)
3756 min_vec_niters = outside_overhead / saving_per_viter + 1;
3758 if (dump_enabled_p ())
3759 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3760 min_vec_niters);
3762 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3764 /* Now that we know the minimum number of vector iterations,
3765 find the minimum niters for which the scalar cost is larger:
3767 SIC * niters > VIC * vniters + VOC - SOC
3769 We know that the minimum niters is no more than
3770 vniters * VF + NPEEL, but it might be (and often is) less
3771 than that if a partial vector iteration is cheaper than the
3772 equivalent scalar code. */
3773 int threshold = (vec_inside_cost * min_vec_niters
3774 + vec_outside_cost
3775 - scalar_outside_cost);
3776 if (threshold <= 0)
3777 min_profitable_iters = 1;
3778 else
3779 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3781 else
3782 /* Convert the number of vector iterations into a number of
3783 scalar iterations. */
3784 min_profitable_iters = (min_vec_niters * assumed_vf
3785 + peel_iters_prologue
3786 + peel_iters_epilogue);
3788 else
3790 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3791 * assumed_vf
3792 - vec_inside_cost * peel_iters_prologue
3793 - vec_inside_cost * peel_iters_epilogue);
3794 if (min_profitable_iters <= 0)
3795 min_profitable_iters = 0;
3796 else
3798 min_profitable_iters /= saving_per_viter;
3800 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3801 <= (((int) vec_inside_cost * min_profitable_iters)
3802 + (((int) vec_outside_cost - scalar_outside_cost)
3803 * assumed_vf)))
3804 min_profitable_iters++;
3808 if (dump_enabled_p ())
3809 dump_printf (MSG_NOTE,
3810 " Calculated minimum iters for profitability: %d\n",
3811 min_profitable_iters);
3813 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3814 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3815 /* We want the vectorized loop to execute at least once. */
3816 min_profitable_iters = assumed_vf + peel_iters_prologue;
3818 if (dump_enabled_p ())
3819 dump_printf_loc (MSG_NOTE, vect_location,
3820 " Runtime profitability threshold = %d\n",
3821 min_profitable_iters);
3823 *ret_min_profitable_niters = min_profitable_iters;
3825 /* Calculate number of iterations required to make the vector version
3826 profitable, relative to the loop bodies only.
3828 The non-vectorized variant costs SIC * niters; the vector variant must win
3829 over it at the expected loop trip count, i.e. the following condition must hold:
3830 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3832 if (vec_outside_cost <= 0)
3833 min_profitable_estimate = 0;
3834 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3836 /* This is a repeat of the code above, but with + SOC rather
3837 than - SOC. */
3838 int outside_overhead = (vec_outside_cost
3839 - scalar_single_iter_cost * peel_iters_prologue
3840 - scalar_single_iter_cost * peel_iters_epilogue
3841 + scalar_outside_cost);
3842 int min_vec_niters = 1;
3843 if (outside_overhead > 0)
3844 min_vec_niters = outside_overhead / saving_per_viter + 1;
3846 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3848 int threshold = (vec_inside_cost * min_vec_niters
3849 + vec_outside_cost
3850 + scalar_outside_cost);
3851 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3853 else
3854 min_profitable_estimate = (min_vec_niters * assumed_vf
3855 + peel_iters_prologue
3856 + peel_iters_epilogue);
3858 else
3860 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3861 * assumed_vf
3862 - vec_inside_cost * peel_iters_prologue
3863 - vec_inside_cost * peel_iters_epilogue)
3864 / ((scalar_single_iter_cost * assumed_vf)
3865 - vec_inside_cost);
3867 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3868 if (dump_enabled_p ())
3869 dump_printf_loc (MSG_NOTE, vect_location,
3870 " Static estimate profitability threshold = %d\n",
3871 min_profitable_estimate);
3873 *ret_min_profitable_estimate = min_profitable_estimate;
3876 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3877 vector elements (not bits) for a vector with NELT elements. */
3878 static void
3879 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3880 vec_perm_builder *sel)
3882 /* The encoding is a single stepped pattern. Any wrap-around is handled
3883 by vec_perm_indices. */
3884 sel->new_vector (nelt, 1, 3);
3885 for (unsigned int i = 0; i < 3; i++)
3886 sel->quick_push (i + offset);
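/* For example, OFFSET == 2 and NELT == 8 encode the series 2, 3, 4, which
vec_perm_indices extends to { 2, 3, ..., 9 }; in a two-input permute this
selects the last six elements of the first vector followed by the first
two of the second, i.e. a shift by two elements. */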
3889 /* Checks whether the target supports whole-vector shifts for vectors of mode
3890 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3891 it supports vec_perm_const with masks for all necessary shift amounts. */
3892 static bool
3893 have_whole_vector_shift (machine_mode mode)
3895 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3896 return true;
3898 /* Variable-length vectors should be handled via the optab. */
3899 unsigned int nelt;
3900 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3901 return false;
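/* E.g. for NELT == 8 the loop below checks shift amounts of 4, 2 and 1
elements, which are the amounts a log2-style final reduction uses. */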
3903 vec_perm_builder sel;
3904 vec_perm_indices indices;
3905 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3907 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3908 indices.new_vector (sel, 2, nelt);
3909 if (!can_vec_perm_const_p (mode, indices, false))
3910 return false;
3912 return true;
3915 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3916 functions. Design better to avoid maintenance issues. */
3918 /* Function vect_model_reduction_cost.
3920 Models cost for a reduction operation, including the vector ops
3921 generated within the strip-mine loop, the initial definition before
3922 the loop, and the epilogue code that must be generated. */
3924 static void
3925 vect_model_reduction_cost (loop_vec_info loop_vinfo,
3926 stmt_vec_info stmt_info, internal_fn reduc_fn,
3927 vect_reduction_type reduction_type,
3928 int ncopies, stmt_vector_for_cost *cost_vec)
3930 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3931 enum tree_code code;
3932 optab optab;
3933 tree vectype;
3934 machine_mode mode;
3935 class loop *loop = NULL;
3937 if (loop_vinfo)
3938 loop = LOOP_VINFO_LOOP (loop_vinfo);
3940 /* Condition reductions generate two reductions in the loop. */
3941 if (reduction_type == COND_REDUCTION)
3942 ncopies *= 2;
3944 vectype = STMT_VINFO_VECTYPE (stmt_info);
3945 mode = TYPE_MODE (vectype);
3946 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3948 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3950 if (reduction_type == EXTRACT_LAST_REDUCTION)
3951 /* No extra instructions are needed in the prologue. The loop body
3952 operations are costed in vectorizable_condition. */
3953 inside_cost = 0;
3954 else if (reduction_type == FOLD_LEFT_REDUCTION)
3956 /* No extra instructions needed in the prologue. */
3957 prologue_cost = 0;
3959 if (reduc_fn != IFN_LAST)
3960 /* Count one reduction-like operation per vector. */
3961 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3962 stmt_info, 0, vect_body);
3963 else
3965 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3966 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3967 inside_cost = record_stmt_cost (cost_vec, nelements,
3968 vec_to_scalar, stmt_info, 0,
3969 vect_body);
3970 inside_cost += record_stmt_cost (cost_vec, nelements,
3971 scalar_stmt, stmt_info, 0,
3972 vect_body);
3975 else
3977 /* Add in cost for initial definition.
3978 For cond reduction we have four vectors: initial index, step,
3979 initial result of the data reduction, initial value of the index
3980 reduction. */
3981 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3982 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3983 scalar_to_vec, stmt_info, 0,
3984 vect_prologue);
3986 /* Cost of reduction op inside loop. */
3987 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3988 stmt_info, 0, vect_body);
3991 /* Determine cost of epilogue code.
3993 We have a reduction operator that will reduce the vector in one statement.
3994 Also requires scalar extract. */
3996 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3998 if (reduc_fn != IFN_LAST)
4000 if (reduction_type == COND_REDUCTION)
4002 /* An EQ stmt and a COND_EXPR stmt. */
4003 epilogue_cost += record_stmt_cost (cost_vec, 2,
4004 vector_stmt, stmt_info, 0,
4005 vect_epilogue);
4006 /* Reduction of the max index and a reduction of the found
4007 values. */
4008 epilogue_cost += record_stmt_cost (cost_vec, 2,
4009 vec_to_scalar, stmt_info, 0,
4010 vect_epilogue);
4011 /* A broadcast of the max value. */
4012 epilogue_cost += record_stmt_cost (cost_vec, 1,
4013 scalar_to_vec, stmt_info, 0,
4014 vect_epilogue);
4016 else
4018 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4019 stmt_info, 0, vect_epilogue);
4020 epilogue_cost += record_stmt_cost (cost_vec, 1,
4021 vec_to_scalar, stmt_info, 0,
4022 vect_epilogue);
4025 else if (reduction_type == COND_REDUCTION)
4027 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4028 /* Extraction of scalar elements. */
4029 epilogue_cost += record_stmt_cost (cost_vec,
4030 2 * estimated_nunits,
4031 vec_to_scalar, stmt_info, 0,
4032 vect_epilogue);
4033 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4034 epilogue_cost += record_stmt_cost (cost_vec,
4035 2 * estimated_nunits - 3,
4036 scalar_stmt, stmt_info, 0,
4037 vect_epilogue);
4039 else if (reduction_type == EXTRACT_LAST_REDUCTION
4040 || reduction_type == FOLD_LEFT_REDUCTION)
4041 /* No extra instructions needed in the epilogue. */
4043 else
4045 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4046 tree bitsize =
4047 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4048 int element_bitsize = tree_to_uhwi (bitsize);
4049 int nelements = vec_size_in_bits / element_bitsize;
4051 if (code == COND_EXPR)
4052 code = MAX_EXPR;
4054 optab = optab_for_tree_code (code, vectype, optab_default);
4056 /* We have a whole vector shift available. */
4057 if (optab != unknown_optab
4058 && VECTOR_MODE_P (mode)
4059 && optab_handler (optab, mode) != CODE_FOR_nothing
4060 && have_whole_vector_shift (mode))
4062 /* Final reduction via vector shifts and the reduction operator.
4063 Also requires scalar extract. */
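/* E.g. 8 elements cost exact_log2 (8) * 2 = 6 vector statements (three
shift/op pairs) plus the single extract charged below. */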
4064 epilogue_cost += record_stmt_cost (cost_vec,
4065 exact_log2 (nelements) * 2,
4066 vector_stmt, stmt_info, 0,
4067 vect_epilogue);
4068 epilogue_cost += record_stmt_cost (cost_vec, 1,
4069 vec_to_scalar, stmt_info, 0,
4070 vect_epilogue);
4072 else
4073 /* Use extracts and reduction op for final reduction. For N
4074 elements, we have N extracts and N-1 reduction ops. */
4075 epilogue_cost += record_stmt_cost (cost_vec,
4076 nelements + nelements - 1,
4077 vector_stmt, stmt_info, 0,
4078 vect_epilogue);
4082 if (dump_enabled_p ())
4083 dump_printf (MSG_NOTE,
4084 "vect_model_reduction_cost: inside_cost = %d, "
4085 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4086 prologue_cost, epilogue_cost);
4090 /* Function vect_model_induction_cost.
4092 Models cost for induction operations. */
4094 static void
4095 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4096 stmt_vector_for_cost *cost_vec)
4098 unsigned inside_cost, prologue_cost;
4100 if (PURE_SLP_STMT (stmt_info))
4101 return;
4103 /* loop cost for vec_loop. */
4104 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4105 stmt_info, 0, vect_body);
4107 /* prologue cost for vec_init and vec_step. */
4108 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4109 stmt_info, 0, vect_prologue);
4111 if (dump_enabled_p ())
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "vect_model_induction_cost: inside_cost = %d, "
4114 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4119 /* Function get_initial_def_for_reduction
4121 Input:
4122 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4123 INIT_VAL - the initial value of the reduction variable
4125 Output:
4126 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4127 of the reduction (used for adjusting the epilog - see below).
4128 Return a vector variable, initialized according to the operation that
4129 STMT_VINFO performs. This vector will be used as the initial value
4130 of the vector of partial results.
4132 Option1 (adjust in epilog): Initialize the vector as follows:
4133 add/bit or/xor: [0,0,...,0,0]
4134 mult/bit and: [1,1,...,1,1]
4135 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4136 and when necessary (e.g. add/mult case) let the caller know
4137 that it needs to adjust the result by init_val.
4139 Option2: Initialize the vector as follows:
4140 add/bit or/xor: [init_val,0,0,...,0]
4141 mult/bit and: [init_val,1,1,...,1]
4142 min/max/cond_expr: [init_val,init_val,...,init_val]
4143 and no adjustments are needed.
4145 For example, for the following code:
4147 s = init_val;
4148 for (i=0;i<n;i++)
4149 s = s + a[i];
4151 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4152 For a vector of 4 units, we want to return either [0,0,0,init_val],
4153 or [0,0,0,0] and let the caller know that it needs to adjust
4154 the result at the end by 'init_val'.
4156 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4157 is not NULL, because the initialization vector is then simpler (the same
4158 element in all entries); otherwise we use Option2.
4160 A cost model should help decide between these two schemes. */
4162 static tree
4163 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4164 stmt_vec_info stmt_vinfo,
4165 enum tree_code code, tree init_val,
4166 tree *adjustment_def)
4168 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4169 tree scalar_type = TREE_TYPE (init_val);
4170 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4171 tree def_for_init;
4172 tree init_def;
4173 REAL_VALUE_TYPE real_init_val = dconst0;
4174 int int_init_val = 0;
4175 gimple_seq stmts = NULL;
4177 gcc_assert (vectype);
4179 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4180 || SCALAR_FLOAT_TYPE_P (scalar_type));
4182 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4183 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4185 /* ADJUSTMENT_DEF is NULL when called from
4186 vect_create_epilog_for_reduction to vectorize double reduction. */
4187 if (adjustment_def)
4188 *adjustment_def = NULL;
4190 switch (code)
4192 case WIDEN_SUM_EXPR:
4193 case DOT_PROD_EXPR:
4194 case SAD_EXPR:
4195 case PLUS_EXPR:
4196 case MINUS_EXPR:
4197 case BIT_IOR_EXPR:
4198 case BIT_XOR_EXPR:
4199 case MULT_EXPR:
4200 case BIT_AND_EXPR:
4202 if (code == MULT_EXPR)
4204 real_init_val = dconst1;
4205 int_init_val = 1;
4208 if (code == BIT_AND_EXPR)
4209 int_init_val = -1;
4211 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4212 def_for_init = build_real (scalar_type, real_init_val);
4213 else
4214 def_for_init = build_int_cst (scalar_type, int_init_val);
4216 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4218 /* Option1: the first element is '0' or '1' as well. */
4219 if (!operand_equal_p (def_for_init, init_val, 0))
4220 *adjustment_def = init_val;
4221 init_def = gimple_build_vector_from_val (&stmts, vectype,
4222 def_for_init);
4224 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4226 /* Option2 (variable length): the first element is INIT_VAL. */
4227 init_def = gimple_build_vector_from_val (&stmts, vectype,
4228 def_for_init);
4229 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4230 vectype, init_def, init_val);
4232 else
4234 /* Option2: the first element is INIT_VAL. */
4235 tree_vector_builder elts (vectype, 1, 2);
4236 elts.quick_push (init_val);
4237 elts.quick_push (def_for_init);
4238 init_def = gimple_build_vector (&stmts, &elts);
4241 break;
4243 case MIN_EXPR:
4244 case MAX_EXPR:
4245 case COND_EXPR:
4247 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4248 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4250 break;
4252 default:
4253 gcc_unreachable ();
4256 if (stmts)
4257 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4258 return init_def;
4261 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4262 NUMBER_OF_VECTORS is the number of vector defs to create.
4263 If NEUTRAL_OP is nonnull, introducing extra elements of that
4264 value will not change the result. */
4266 static void
4267 get_initial_defs_for_reduction (vec_info *vinfo,
4268 slp_tree slp_node,
4269 vec<tree> *vec_oprnds,
4270 unsigned int number_of_vectors,
4271 bool reduc_chain, tree neutral_op)
4273 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4274 stmt_vec_info stmt_vinfo = stmts[0];
4275 unsigned HOST_WIDE_INT nunits;
4276 unsigned j, number_of_places_left_in_vector;
4277 tree vector_type;
4278 unsigned int group_size = stmts.length ();
4279 unsigned int i;
4280 class loop *loop;
4282 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4284 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4286 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4287 gcc_assert (loop);
4288 edge pe = loop_preheader_edge (loop);
4290 gcc_assert (!reduc_chain || neutral_op);
4292 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4293 created vectors. It is greater than 1 if unrolling is performed.
4295 For example, we have two scalar operands, s1 and s2 (e.g., group of
4296 strided accesses of size two), while NUNITS is four (i.e., four scalars
4297 of this type can be packed in a vector). The output vector will contain
4298 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4299 will be 2).
4301 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4302 vectors containing the operands.
4304 For example, NUNITS is four as before, and the group size is 8
4305 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4306 {s5, s6, s7, s8}. */
4308 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4309 nunits = group_size;
4311 number_of_places_left_in_vector = nunits;
4312 bool constant_p = true;
4313 tree_vector_builder elts (vector_type, nunits, 1);
4314 elts.quick_grow (nunits);
4315 gimple_seq ctor_seq = NULL;
4316 for (j = 0; j < nunits * number_of_vectors; ++j)
4318 tree op;
4319 i = j % group_size;
4320 stmt_vinfo = stmts[i];
4322 /* Get the def before the loop. In a reduction chain we have only one
4323 initial value; otherwise we have as many as there are PHIs in the group. */
4324 if (reduc_chain)
4325 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4326 else if (((vec_oprnds->length () + 1) * nunits
4327 - number_of_places_left_in_vector >= group_size)
4328 && neutral_op)
4329 op = neutral_op;
4330 else
4331 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4333 /* Create 'vect_ = {op0,op1,...,opn}'. */
4334 number_of_places_left_in_vector--;
4335 elts[nunits - number_of_places_left_in_vector - 1] = op;
4336 if (!CONSTANT_CLASS_P (op))
4337 constant_p = false;
4339 if (number_of_places_left_in_vector == 0)
4341 tree init;
4342 if (constant_p && !neutral_op
4343 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4344 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4345 /* Build the vector directly from ELTS. */
4346 init = gimple_build_vector (&ctor_seq, &elts);
4347 else if (neutral_op)
4349 /* Build a vector of the neutral value and shift the
4350 other elements into place. */
4351 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4352 neutral_op);
4353 int k = nunits;
4354 while (k > 0 && elts[k - 1] == neutral_op)
4355 k -= 1;
4356 while (k > 0)
4358 k -= 1;
4359 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4360 vector_type, init, elts[k]);
4363 else
4365 /* First time round, duplicate ELTS to fill the
4366 required number of vectors. */
4367 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4368 number_of_vectors, *vec_oprnds);
4369 break;
4371 vec_oprnds->quick_push (init);
4373 number_of_places_left_in_vector = nunits;
4374 elts.new_vector (vector_type, nunits, 1);
4375 elts.quick_grow (nunits);
4376 constant_p = true;
4379 if (ctor_seq != NULL)
4380 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4383 /* For a statement STMT_INFO taking part in a reduction operation return
4384 the stmt_vec_info on which the meta information is stored. */
4386 stmt_vec_info
4387 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4389 stmt_info = vect_orig_stmt (stmt_info);
4390 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4391 if (!is_a <gphi *> (stmt_info->stmt))
4392 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4393 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4394 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4396 if (gimple_phi_num_args (phi) == 1)
4397 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4399 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4401 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4402 stmt_vec_info info
4403 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4404 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4405 stmt_info = info;
4407 return stmt_info;
4410 /* Function vect_create_epilog_for_reduction
4412 Create code at the loop-epilog to finalize the result of a reduction
4413 computation.
4415 STMT_INFO is the scalar reduction stmt that is being vectorized.
4416 SLP_NODE is an SLP node containing a group of reduction statements. The
4417 first one in this group is STMT_INFO.
4418 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
4419 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
4420 (counting from 0).
4422 This function:
4423 1. Completes the reduction def-use cycles.
4424 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4425 by calling the function specified by REDUC_FN if available, or by
4426 other means (whole-vector shifts or a scalar loop).
4427 The function also creates a new phi node at the loop exit to preserve
4428 loop-closed form, as illustrated below.
4430 The flow at the entry to this function:
4432 loop:
4433 vec_def = phi <vec_init, null> # REDUCTION_PHI
4434 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4435 s_loop = scalar_stmt # (scalar) STMT_INFO
4436 loop_exit:
4437 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4438 use <s_out0>
4439 use <s_out0>
4441 The above is transformed by this function into:
4443 loop:
4444 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4445 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4446 s_loop = scalar_stmt # (scalar) STMT_INFO
4447 loop_exit:
4448 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4449 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4450 v_out2 = reduce <v_out1>
4451 s_out3 = extract_field <v_out2, 0>
4452 s_out4 = adjust_result <s_out3>
4453 use <s_out4>
4454 use <s_out4>
4457 static void
4458 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4459 stmt_vec_info stmt_info,
4460 slp_tree slp_node,
4461 slp_instance slp_node_instance)
4463 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4464 gcc_assert (reduc_info->is_reduc_info);
4465 /* For double reductions we need to get at the inner loop reduction
4466 stmt which has the meta info attached. Our stmt_info is that of the
4467 loop-closed PHI of the inner loop which we remember as
4468 def for the reduction PHI generation. */
4469 bool double_reduc = false;
4470 stmt_vec_info rdef_info = stmt_info;
4471 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4473 gcc_assert (!slp_node);
4474 double_reduc = true;
4475 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4476 (stmt_info->stmt, 0));
4477 stmt_info = vect_stmt_to_vectorize (stmt_info);
4479 gphi *reduc_def_stmt
4480 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4481 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4482 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4483 stmt_vec_info prev_phi_info;
4484 tree vectype;
4485 machine_mode mode;
4486 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4487 basic_block exit_bb;
4488 tree scalar_dest;
4489 tree scalar_type;
4490 gimple *new_phi = NULL, *phi;
4491 stmt_vec_info phi_info;
4492 gimple_stmt_iterator exit_gsi;
4493 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4494 gimple *epilog_stmt = NULL;
4495 gimple *exit_phi;
4496 tree bitsize;
4497 tree def;
4498 tree orig_name, scalar_result;
4499 imm_use_iterator imm_iter, phi_imm_iter;
4500 use_operand_p use_p, phi_use_p;
4501 gimple *use_stmt;
4502 bool nested_in_vect_loop = false;
4503 auto_vec<gimple *> new_phis;
4504 int j, i;
4505 auto_vec<tree> scalar_results;
4506 unsigned int group_size = 1, k;
4507 auto_vec<gimple *> phis;
4508 bool slp_reduc = false;
4509 bool direct_slp_reduc;
4510 tree new_phi_result;
4511 tree induction_index = NULL_TREE;
4513 if (slp_node)
4514 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4516 if (nested_in_vect_loop_p (loop, stmt_info))
4518 outer_loop = loop;
4519 loop = loop->inner;
4520 nested_in_vect_loop = true;
4521 gcc_assert (!slp_node);
4523 gcc_assert (!nested_in_vect_loop || double_reduc);
4525 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4526 gcc_assert (vectype);
4527 mode = TYPE_MODE (vectype);
4529 tree initial_def = NULL;
4530 tree induc_val = NULL_TREE;
4531 tree adjustment_def = NULL;
4532 if (slp_node)
4534 else
4536 /* Get at the scalar def before the loop, that defines the initial value
4537 of the reduction variable. */
4538 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4539 loop_preheader_edge (loop));
4540 /* Optimize: for induction condition reduction, if we can't use zero
4541 for induc_val, use initial_def. */
4542 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4543 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4544 else if (double_reduc)
4546 else if (nested_in_vect_loop)
4548 else
4549 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4552 unsigned vec_num;
4553 int ncopies;
4554 if (slp_node)
4556 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4557 ncopies = 1;
4559 else
4561 vec_num = 1;
4562 ncopies = 0;
4563 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4566 ncopies++;
4567 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4569 while (phi_info);
4572 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4573 which is updated with the current index of the loop for every match of
4574 the original loop's cond_expr (VEC_STMT). This results in a vector
4575 containing the last time the condition passed for that vector lane.
4576 The first match will be a 1 to allow 0 to be used for non-matching
4577 indexes. If there are no matches at all then the vector will be all
4578 zeroes.
4580 PR92772: This algorithm is broken for architectures that support
4581 masked vectors, but do not provide fold_extract_last. */
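/* As an illustration, with four lanes the induction variable below holds
{ 1, 2, 3, 4 } in the first vector iteration and { 5, 6, 7, 8 } in the
second; a lane whose condition last matched in the second iteration thus
ends up holding a value in the range 5..8, lanes that never matched stay
0, and the maximum over all lanes identifies the latest match. */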
4582 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4584 auto_vec<std::pair<tree, bool>, 2> ccompares;
4585 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4586 cond_info = vect_stmt_to_vectorize (cond_info);
4587 while (cond_info != reduc_info)
4589 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4591 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4592 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4593 ccompares.safe_push
4594 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4595 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4597 cond_info
4598 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4599 1 + STMT_VINFO_REDUC_IDX
4600 (cond_info)));
4601 cond_info = vect_stmt_to_vectorize (cond_info);
4603 gcc_assert (ccompares.length () != 0);
4605 tree indx_before_incr, indx_after_incr;
4606 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4607 int scalar_precision
4608 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4609 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4610 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4611 (TYPE_MODE (vectype), cr_index_scalar_type,
4612 TYPE_VECTOR_SUBPARTS (vectype));
4614 /* First we create a simple vector induction variable which starts
4615 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4616 vector size (STEP). */
4618 /* Create a {1,2,3,...} vector. */
4619 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4621 /* Create a vector of the step value. */
4622 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4623 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4625 /* Create an induction variable. */
4626 gimple_stmt_iterator incr_gsi;
4627 bool insert_after;
4628 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4629 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4630 insert_after, &indx_before_incr, &indx_after_incr);
4632 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4633 filled with zeros (VEC_ZERO). */
4635 /* Create a vector of 0s. */
4636 tree zero = build_zero_cst (cr_index_scalar_type);
4637 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4639 /* Create a vector phi node. */
4640 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4641 new_phi = create_phi_node (new_phi_tree, loop->header);
4642 loop_vinfo->add_stmt (new_phi);
4643 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4644 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4646 /* Now take the condition from the loop's original cond_exprs
4647 and produce a new cond_expr (INDEX_COND_EXPR) which for
4648 every match uses values from the induction variable
4649 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4650 (NEW_PHI_TREE).
4651 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4652 the new cond_expr (INDEX_COND_EXPR). */
4653 gimple_seq stmts = NULL;
4654 for (int i = ccompares.length () - 1; i != -1; --i)
4656 tree ccompare = ccompares[i].first;
4657 if (ccompares[i].second)
4658 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4659 cr_index_vector_type,
4660 ccompare,
4661 indx_before_incr, new_phi_tree);
4662 else
4663 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4664 cr_index_vector_type,
4665 ccompare,
4666 new_phi_tree, indx_before_incr);
4668 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4669 stmt_vec_info index_vec_info
4670 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4671 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4673 /* Update the phi with the vec cond. */
4674 induction_index = new_phi_tree;
4675 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4676 loop_latch_edge (loop), UNKNOWN_LOCATION);
4679 /* 2. Create epilog code.
4680 The reduction epilog code operates across the elements of the vector
4681 of partial results computed by the vectorized loop.
4682 The reduction epilog code consists of:
4684 step 1: compute the scalar result in a vector (v_out2)
4685 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4686 step 3: adjust the scalar result (s_out3) if needed.
4688 Step 1 can be accomplished using one of the following three schemes:
4689 (scheme 1) using reduc_fn, if available.
4690 (scheme 2) using whole-vector shifts, if available.
4691 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4692 combined.
4694 The overall epilog code looks like this:
4696 s_out0 = phi <s_loop> # original EXIT_PHI
4697 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4698 v_out2 = reduce <v_out1> # step 1
4699 s_out3 = extract_field <v_out2, 0> # step 2
4700 s_out4 = adjust_result <s_out3> # step 3
4702 (step 3 is optional, and steps 1 and 2 may be combined).
4703 Lastly, the uses of s_out0 are replaced by s_out4. */
4706 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4707 v_out1 = phi <VECT_DEF>
4708 Store them in NEW_PHIS. */
4709 if (double_reduc)
4710 loop = outer_loop;
4711 exit_bb = single_exit (loop)->dest;
4712 prev_phi_info = NULL;
4713 new_phis.create (slp_node ? vec_num : ncopies);
4714 for (unsigned i = 0; i < vec_num; i++)
4716 if (slp_node)
4717 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4718 else
4719 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4720 for (j = 0; j < ncopies; j++)
4722 tree new_def = copy_ssa_name (def);
4723 phi = create_phi_node (new_def, exit_bb);
4724 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4725 if (j == 0)
4726 new_phis.quick_push (phi);
4727 else
4729 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4730 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4733 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4734 prev_phi_info = phi_info;
4738 exit_gsi = gsi_after_labels (exit_bb);
4740 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4741 (i.e. when reduc_fn is not available) and in the final adjustment
4742 code (if needed). Also get the original scalar reduction variable as
4743 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4744 represents a reduction pattern), the tree-code and scalar-def are
4745 taken from the original stmt that the pattern-stmt (STMT) replaces.
4746 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4747 are taken from STMT. */
4749 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4750 if (orig_stmt_info != stmt_info)
4752 /* Reduction pattern */
4753 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4754 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4757 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4758 scalar_type = TREE_TYPE (scalar_dest);
4759 scalar_results.create (group_size);
4760 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4761 bitsize = TYPE_SIZE (scalar_type);
4763 /* SLP reduction without reduction chain, e.g.,
4764 # a1 = phi <a2, a0>
4765 # b1 = phi <b2, b0>
4766 a2 = operation (a1)
4767 b2 = operation (b1) */
4768 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4770 /* True if we should implement SLP_REDUC using native reduction operations
4771 instead of scalar operations. */
4772 direct_slp_reduc = (reduc_fn != IFN_LAST
4773 && slp_reduc
4774 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4776 /* In case of reduction chain, e.g.,
4777 # a1 = phi <a3, a0>
4778 a2 = operation (a1)
4779 a3 = operation (a2),
4781 we may end up with more than one vector result. Here we reduce them to
4782 one vector. */
4783 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4785 gimple_seq stmts = NULL;
4786 tree first_vect = PHI_RESULT (new_phis[0]);
4787 first_vect = gimple_convert (&stmts, vectype, first_vect);
4788 for (k = 1; k < new_phis.length (); k++)
4790 gimple *next_phi = new_phis[k];
4791 tree second_vect = PHI_RESULT (next_phi);
4792 second_vect = gimple_convert (&stmts, vectype, second_vect);
4793 first_vect = gimple_build (&stmts, code, vectype,
4794 first_vect, second_vect);
4796 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4798 new_phi_result = first_vect;
4799 new_phis.truncate (0);
4800 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4802 /* Likewise if we couldn't use a single defuse cycle. */
4803 else if (ncopies > 1)
4805 gcc_assert (new_phis.length () == 1);
4806 gimple_seq stmts = NULL;
4807 tree first_vect = PHI_RESULT (new_phis[0]);
4808 first_vect = gimple_convert (&stmts, vectype, first_vect);
4809 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4810 for (int k = 1; k < ncopies; ++k)
4812 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4813 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4814 second_vect = gimple_convert (&stmts, vectype, second_vect);
4815 first_vect = gimple_build (&stmts, code, vectype,
4816 first_vect, second_vect);
4818 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4819 new_phi_result = first_vect;
4820 new_phis.truncate (0);
4821 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4823 else
4824 new_phi_result = PHI_RESULT (new_phis[0]);
4826 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4827 && reduc_fn != IFN_LAST)
4829 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4830 various data values where the condition matched and another vector
4831 (INDUCTION_INDEX) containing all the indexes of those matches. We
4832 need to extract the last matching index (which will be the index with
4833 highest value) and use this to index into the data vector.
4834 For the case where there were no matches, the data vector will contain
4835 all default values and the index vector will be all zeros. */
4837 /* Get various versions of the type of the vector of indexes. */
4838 tree index_vec_type = TREE_TYPE (induction_index);
4839 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4840 tree index_scalar_type = TREE_TYPE (index_vec_type);
4841 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4843 /* Get an unsigned integer version of the type of the data vector. */
4844 int scalar_precision
4845 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4846 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4847 tree vectype_unsigned = build_vector_type
4848 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4850 /* First we need to create a vector (ZERO_VEC) of zeros and another
4851 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4852 can create using a MAX reduction and then expanding.
4853 In the case where the loop never made any matches, the max index will
4854 be zero. */
4856 /* Vector of {0, 0, 0,...}. */
4857 tree zero_vec = build_zero_cst (vectype);
4859 gimple_seq stmts = NULL;
4860 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4861 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4863 /* Find maximum value from the vector of found indexes. */
4864 tree max_index = make_ssa_name (index_scalar_type);
4865 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4866 1, induction_index);
4867 gimple_call_set_lhs (max_index_stmt, max_index);
4868 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4870 /* Vector of {max_index, max_index, max_index,...}. */
4871 tree max_index_vec = make_ssa_name (index_vec_type);
4872 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4873 max_index);
4874 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4875 max_index_vec_rhs);
4876 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4878 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4879 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4880 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4881 otherwise. Only one value should match, resulting in a vector
4882 (VEC_COND) with one data value and the rest zeros.
4883 In the case where the loop never made any matches, every index will
4884 match, resulting in a vector with all data values (which will all be
4885 the default value). */
4887 /* Compare the max index vector to the vector of found indexes to find
4888 the position of the max value. */
4889 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4890 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4891 induction_index,
4892 max_index_vec);
4893 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4895 /* Use the compare to choose either values from the data vector or
4896 zero. */
4897 tree vec_cond = make_ssa_name (vectype);
4898 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4899 vec_compare, new_phi_result,
4900 zero_vec);
4901 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4903 /* Finally we need to extract the data value from the vector (VEC_COND)
4904 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4905 reduction, but because this doesn't exist, we can use a MAX reduction
4906 instead. The data value might be signed or a float so we need to cast
4907 it first.
4908 In the case where the loop never made any matches, the data values are
4909 all identical, and so will reduce down correctly. */
4911 /* Make the matched data values unsigned. */
4912 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4913 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4914 vec_cond);
4915 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4916 VIEW_CONVERT_EXPR,
4917 vec_cond_cast_rhs);
4918 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4920 /* Reduce down to a scalar value. */
4921 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4922 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4923 1, vec_cond_cast);
4924 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4925 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4927 /* Convert the reduced value back to the result type and set as the
4928 result. */
4929 stmts = NULL;
4930 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4931 data_reduc);
4932 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4933 scalar_results.safe_push (new_temp);
4935 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4936 && reduc_fn == IFN_LAST)
4938 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4939 idx = 0;
4940 idx_val = induction_index[0];
4941 val = data_reduc[0];
4942 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4943 if (induction_index[i] > idx_val)
4944 val = data_reduc[i], idx_val = induction_index[i];
4945 return val; */
4947 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4948 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4949 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4950 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4951 /* Enforced by vectorizable_reduction, which ensures we have target
4952 support before allowing a conditional reduction on variable-length
4953 vectors. */
4954 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4955 tree idx_val = NULL_TREE, val = NULL_TREE;
4956 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4958 tree old_idx_val = idx_val;
4959 tree old_val = val;
4960 idx_val = make_ssa_name (idx_eltype);
4961 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4962 build3 (BIT_FIELD_REF, idx_eltype,
4963 induction_index,
4964 bitsize_int (el_size),
4965 bitsize_int (off)));
4966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4967 val = make_ssa_name (data_eltype);
4968 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4969 build3 (BIT_FIELD_REF,
4970 data_eltype,
4971 new_phi_result,
4972 bitsize_int (el_size),
4973 bitsize_int (off)));
4974 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975 if (off != 0)
4977 tree new_idx_val = idx_val;
4978 if (off != v_size - el_size)
4980 new_idx_val = make_ssa_name (idx_eltype);
4981 epilog_stmt = gimple_build_assign (new_idx_val,
4982 MAX_EXPR, idx_val,
4983 old_idx_val);
4984 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4986 tree new_val = make_ssa_name (data_eltype);
4987 epilog_stmt = gimple_build_assign (new_val,
4988 COND_EXPR,
4989 build2 (GT_EXPR,
4990 boolean_type_node,
4991 idx_val,
4992 old_idx_val),
4993 val, old_val);
4994 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4995 idx_val = new_idx_val;
4996 val = new_val;
4999 /* Convert the reduced value back to the result type and set as the
5000 result. */
5001 gimple_seq stmts = NULL;
5002 val = gimple_convert (&stmts, scalar_type, val);
5003 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5004 scalar_results.safe_push (val);
5007 /* 2.3 Create the reduction code, using one of the three schemes described
5008 above. In SLP we simply need to extract all the elements from the
5009 vector (without reducing them), so we use scalar shifts. */
5010 else if (reduc_fn != IFN_LAST && !slp_reduc)
5012 tree tmp;
5013 tree vec_elem_type;
5015 /* Case 1: Create:
5016 v_out2 = reduc_expr <v_out1> */
5018 if (dump_enabled_p ())
5019 dump_printf_loc (MSG_NOTE, vect_location,
5020 "Reduce using direct vector reduction.\n");
5022 gimple_seq stmts = NULL;
5023 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5024 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5025 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5026 vec_elem_type, new_phi_result);
5027 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5028 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5030 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5031 && induc_val)
5033 /* Earlier we set the initial value to be a vector of induc_val
5034 values. Check the result and if it is induc_val then replace
5035 with the original initial value, unless induc_val is
5036 the same as initial_def already. */
5037 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5038 induc_val);
5040 tmp = make_ssa_name (new_scalar_dest);
5041 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5042 initial_def, new_temp);
5043 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5044 new_temp = tmp;
5047 scalar_results.safe_push (new_temp);
5049 else if (direct_slp_reduc)
5051 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5052 with the elements for other SLP statements replaced with the
5053 neutral value. We can then do a normal reduction on each vector. */
5055 /* Enforced by vectorizable_reduction. */
5056 gcc_assert (new_phis.length () == 1);
5057 gcc_assert (pow2p_hwi (group_size));
5059 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5060 vec<stmt_vec_info> orig_phis
5061 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5062 gimple_seq seq = NULL;
5064 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5065 and the same element size as VECTYPE. */
5066 tree index = build_index_vector (vectype, 0, 1);
5067 tree index_type = TREE_TYPE (index);
5068 tree index_elt_type = TREE_TYPE (index_type);
5069 tree mask_type = truth_type_for (index_type);
5071 /* Create a vector that, for each element, identifies which of
5072 the REDUC_GROUP_SIZE results should use it. */
5073 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5074 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5075 build_vector_from_val (index_type, index_mask));
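/* For instance, with an illustrative group_size of 2 and four elements,
   index = { 0, 1, 2, 3 } & { 1, 1, 1, 1 } = { 0, 1, 0, 1 },
   so even-numbered elements belong to the first SLP statement and
   odd-numbered elements to the second. */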
5077 /* Get a neutral vector value. This is simply a splat of the neutral
5078 scalar value if we have one, otherwise the initial scalar value
5079 is itself a neutral value. */
5080 tree vector_identity = NULL_TREE;
5081 tree neutral_op = NULL_TREE;
5082 if (slp_node)
5084 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5085 neutral_op
5086 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5087 vectype, code, first != NULL);
5089 if (neutral_op)
5090 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5091 neutral_op);
5092 for (unsigned int i = 0; i < group_size; ++i)
5094 /* If there's no universal neutral value, we can use the
5095 initial scalar value from the original PHI. This is used
5096 for MIN and MAX reduction, for example. */
5097 if (!neutral_op)
5099 tree scalar_value
5100 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5101 loop_preheader_edge (loop));
5102 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5103 scalar_value);
5104 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5105 scalar_value);
5108 /* Calculate the equivalent of:
5110 sel[j] = (index[j] == i);
5112 which selects the elements of NEW_PHI_RESULT that should
5113 be included in the result. */
5114 tree compare_val = build_int_cst (index_elt_type, i);
5115 compare_val = build_vector_from_val (index_type, compare_val);
5116 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5117 index, compare_val);
5119 /* Calculate the equivalent of:
5121 vec = sel ? new_phi_result : vector_identity;
5123 VEC is now suitable for a full vector reduction. */
5124 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5125 sel, new_phi_result, vector_identity);
5127 /* Do the reduction and convert it to the appropriate type. */
5128 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5129 TREE_TYPE (vectype), vec);
5130 scalar = gimple_convert (&seq, scalar_type, scalar);
5131 scalar_results.safe_push (scalar);
5133 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5135 else
5137 bool reduce_with_shift;
5138 tree vec_temp;
5140 gcc_assert (slp_reduc || new_phis.length () == 1);
5142 /* See if the target wants to do the final (shift) reduction
5143 in a vector mode of smaller size and first reduce upper/lower
5144 halves against each other. */
5145 enum machine_mode mode1 = mode;
5146 tree stype = TREE_TYPE (vectype);
5147 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5148 unsigned nunits1 = nunits;
5149 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5150 && new_phis.length () == 1)
5152 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5153 /* For SLP reductions we have to make sure lanes match up, but
5154 since we're doing individual element final reduction reducing
5155 vector width here is even more important.
5156 ??? We can also separate lanes with permutes; for the common
5157 case of a power-of-two group-size, odd/even extracts would work. */
5158 if (slp_reduc && nunits != nunits1)
5160 nunits1 = least_common_multiple (nunits1, group_size);
5161 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5164 if (!slp_reduc
5165 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5166 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5168 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5169 stype, nunits1);
5170 reduce_with_shift = have_whole_vector_shift (mode1);
5171 if (!VECTOR_MODE_P (mode1))
5172 reduce_with_shift = false;
5173 else
5175 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5176 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5177 reduce_with_shift = false;
5180 /* First reduce the vector to the desired vector size we should
5181 do shift reduction on by combining upper and lower halves. */
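/* For example (illustrative), a V8SI accumulator on a target whose
   preferred reduction mode is V4SI is split into its low and high V4SI
   halves, which are combined with CODE into a single V4SI vector
   before the final shift or scalar reduction below. */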
5182 new_temp = new_phi_result;
5183 while (nunits > nunits1)
5185 nunits /= 2;
5186 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5187 stype, nunits);
5188 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5190 /* The target has to make sure we support lowpart/highpart
5191 extraction, either via direct vector extract or through
5192 an integer mode punning. */
5193 tree dst1, dst2;
5194 if (convert_optab_handler (vec_extract_optab,
5195 TYPE_MODE (TREE_TYPE (new_temp)),
5196 TYPE_MODE (vectype1))
5197 != CODE_FOR_nothing)
5199 /* Extract sub-vectors directly once vec_extract becomes
5200 a conversion optab. */
5201 dst1 = make_ssa_name (vectype1);
5202 epilog_stmt
5203 = gimple_build_assign (dst1, BIT_FIELD_REF,
5204 build3 (BIT_FIELD_REF, vectype1,
5205 new_temp, TYPE_SIZE (vectype1),
5206 bitsize_int (0)));
5207 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5208 dst2 = make_ssa_name (vectype1);
5209 epilog_stmt
5210 = gimple_build_assign (dst2, BIT_FIELD_REF,
5211 build3 (BIT_FIELD_REF, vectype1,
5212 new_temp, TYPE_SIZE (vectype1),
5213 bitsize_int (bitsize)));
5214 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5216 else
5218 /* Extract via punning to appropriately sized integer mode
5219 vector. */
5220 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5221 tree etype = build_vector_type (eltype, 2);
5222 gcc_assert (convert_optab_handler (vec_extract_optab,
5223 TYPE_MODE (etype),
5224 TYPE_MODE (eltype))
5225 != CODE_FOR_nothing);
5226 tree tem = make_ssa_name (etype);
5227 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5228 build1 (VIEW_CONVERT_EXPR,
5229 etype, new_temp));
5230 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5231 new_temp = tem;
5232 tem = make_ssa_name (eltype);
5233 epilog_stmt
5234 = gimple_build_assign (tem, BIT_FIELD_REF,
5235 build3 (BIT_FIELD_REF, eltype,
5236 new_temp, TYPE_SIZE (eltype),
5237 bitsize_int (0)));
5238 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5239 dst1 = make_ssa_name (vectype1);
5240 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5241 build1 (VIEW_CONVERT_EXPR,
5242 vectype1, tem));
5243 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5244 tem = make_ssa_name (eltype);
5245 epilog_stmt
5246 = gimple_build_assign (tem, BIT_FIELD_REF,
5247 build3 (BIT_FIELD_REF, eltype,
5248 new_temp, TYPE_SIZE (eltype),
5249 bitsize_int (bitsize)));
5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 dst2 = make_ssa_name (vectype1);
5252 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5253 build1 (VIEW_CONVERT_EXPR,
5254 vectype1, tem));
5255 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5258 new_temp = make_ssa_name (vectype1);
5259 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5261 new_phis[0] = epilog_stmt;
5264 if (reduce_with_shift && !slp_reduc)
5266 int element_bitsize = tree_to_uhwi (bitsize);
5267 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5268 for variable-length vectors and also requires direct target support
5269 for loop reductions. */
5270 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5271 int nelements = vec_size_in_bits / element_bitsize;
5272 vec_perm_builder sel;
5273 vec_perm_indices indices;
5275 int elt_offset;
5277 tree zero_vec = build_zero_cst (vectype1);
5278 /* Case 2: Create:
5279 for (offset = nelements/2; offset >= 1; offset/=2)
5281 Create: va' = vec_shift <va, offset>
5282 Create: va = vop <va, va'>
5283 } */
5285 tree rhs;
5287 if (dump_enabled_p ())
5288 dump_printf_loc (MSG_NOTE, vect_location,
5289 "Reduce using vector shifts\n");
5291 gimple_seq stmts = NULL;
5292 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5293 for (elt_offset = nelements / 2;
5294 elt_offset >= 1;
5295 elt_offset /= 2)
5297 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5298 indices.new_vector (sel, 2, nelements);
5299 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5300 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5301 new_temp, zero_vec, mask);
5302 new_temp = gimple_build (&stmts, code,
5303 vectype1, new_name, new_temp);
5305 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5307 /* 2.4 Extract the final scalar result. Create:
5308 s_out3 = extract_field <v_out2, bitpos> */
5310 if (dump_enabled_p ())
5311 dump_printf_loc (MSG_NOTE, vect_location,
5312 "extract scalar result\n");
5314 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5315 bitsize, bitsize_zero_node);
5316 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5317 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5318 gimple_assign_set_lhs (epilog_stmt, new_temp);
5319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5320 scalar_results.safe_push (new_temp);
5322 else
5324 /* Case 3: Create:
5325 s = extract_field <v_out2, 0>
5326 for (offset = element_size;
5327 offset < vector_size;
5328 offset += element_size)
5330 Create: s' = extract_field <v_out2, offset>
5331 Create: s = op <s, s'> // For non SLP cases
5332 } */
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_NOTE, vect_location,
5336 "Reduce using scalar code.\n");
5338 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5339 int element_bitsize = tree_to_uhwi (bitsize);
5340 tree compute_type = TREE_TYPE (vectype);
5341 gimple_seq stmts = NULL;
5342 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5344 int bit_offset;
5345 if (gimple_code (new_phi) == GIMPLE_PHI)
5346 vec_temp = PHI_RESULT (new_phi);
5347 else
5348 vec_temp = gimple_assign_lhs (new_phi);
5349 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5350 vec_temp, bitsize, bitsize_zero_node);
5352 /* In SLP we don't need to apply reduction operation, so we just
5353 collect s' values in SCALAR_RESULTS. */
5354 if (slp_reduc)
5355 scalar_results.safe_push (new_temp);
5357 for (bit_offset = element_bitsize;
5358 bit_offset < vec_size_in_bits;
5359 bit_offset += element_bitsize)
5361 tree bitpos = bitsize_int (bit_offset);
5362 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5363 compute_type, vec_temp,
5364 bitsize, bitpos);
5365 if (slp_reduc)
5367 /* In SLP we don't need to apply reduction operation, so
5368 we just collect s' values in SCALAR_RESULTS. */
5369 new_temp = new_name;
5370 scalar_results.safe_push (new_name);
5372 else
5373 new_temp = gimple_build (&stmts, code, compute_type,
5374 new_name, new_temp);
5378 /* The only case where we need to reduce scalar results in SLP, is
5379 unrolling. If the size of SCALAR_RESULTS is greater than
5380 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5381 REDUC_GROUP_SIZE. */
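/* For example (illustrative), with REDUC_GROUP_SIZE == 2 and four
   collected scalars s0, s1, s2, s3 this computes
   scalar_results[0] = s0 CODE s2
   scalar_results[1] = s1 CODE s3. */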
5382 if (slp_reduc)
5384 tree res, first_res, new_res;
5386 /* Reduce multiple scalar results in case of SLP unrolling. */
5387 for (j = group_size; scalar_results.iterate (j, &res);
5388 j++)
5390 first_res = scalar_results[j % group_size];
5391 new_res = gimple_build (&stmts, code, compute_type,
5392 first_res, res);
5393 scalar_results[j % group_size] = new_res;
5395 for (k = 0; k < group_size; k++)
5396 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5397 scalar_results[k]);
5399 else
5401 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5402 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5403 scalar_results.safe_push (new_temp);
5406 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5409 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5410 && induc_val)
5412 /* Earlier we set the initial value to be a vector of induc_val
5413 values. Check the result and if it is induc_val then replace
5414 it with the original initial value, unless induc_val is
5415 the same as initial_def already. */
5416 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5417 induc_val);
5419 tree tmp = make_ssa_name (new_scalar_dest);
5420 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5421 initial_def, new_temp);
5422 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5423 scalar_results[0] = tmp;
5427 /* 2.5 Adjust the final result by the initial value of the reduction
5428 variable. (When such adjustment is not needed, then
5429 'adjustment_def' is zero). For example, if code is PLUS we create:
5430 new_temp = loop_exit_def + adjustment_def */
5432 if (adjustment_def)
5434 gcc_assert (!slp_reduc);
5435 gimple_seq stmts = NULL;
5436 if (nested_in_vect_loop)
5438 new_phi = new_phis[0];
5439 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5440 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5441 new_temp = gimple_build (&stmts, code, vectype,
5442 PHI_RESULT (new_phi), adjustment_def);
5444 else
5446 new_temp = scalar_results[0];
5447 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5448 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5449 new_temp = gimple_build (&stmts, code, scalar_type,
5450 new_temp, adjustment_def);
5453 epilog_stmt = gimple_seq_last_stmt (stmts);
5454 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5455 if (nested_in_vect_loop)
5457 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5458 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5459 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5461 if (!double_reduc)
5462 scalar_results.quick_push (new_temp);
5463 else
5464 scalar_results[0] = new_temp;
5466 else
5467 scalar_results[0] = new_temp;
5469 new_phis[0] = epilog_stmt;
5472 if (double_reduc)
5473 loop = loop->inner;
5475 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5476 phis with new adjusted scalar results, i.e., replace use <s_out0>
5477 with use <s_out4>.
5479 Transform:
5480 loop_exit:
5481 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5482 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5483 v_out2 = reduce <v_out1>
5484 s_out3 = extract_field <v_out2, 0>
5485 s_out4 = adjust_result <s_out3>
5486 use <s_out0>
5487 use <s_out0>
5489 into:
5491 loop_exit:
5492 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5493 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5494 v_out2 = reduce <v_out1>
5495 s_out3 = extract_field <v_out2, 0>
5496 s_out4 = adjust_result <s_out3>
5497 use <s_out4>
5498 use <s_out4> */
5501 /* In SLP reduction chain we reduce vector results into one vector if
5502 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5503 LHS of the last stmt in the reduction chain, since we are looking for
5504 the loop exit phi node. */
5505 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5507 stmt_vec_info dest_stmt_info
5508 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5509 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5510 group_size = 1;
5513 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5514 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5515 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5516 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5517 correspond to the first vector stmt, etc.
5518 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5519 if (group_size > new_phis.length ())
5520 gcc_assert (!(group_size % new_phis.length ()));
5522 for (k = 0; k < group_size; k++)
5524 if (slp_reduc)
5526 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5528 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5529 /* SLP statements can't participate in patterns. */
5530 gcc_assert (!orig_stmt_info);
5531 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5534 if (nested_in_vect_loop)
5536 if (double_reduc)
5537 loop = outer_loop;
5538 else
5539 gcc_unreachable ();
5542 phis.create (3);
5543 /* Find the loop-closed-use at the loop exit of the original scalar
5544 result. (The reduction result is expected to have two immediate uses,
5545 one at the latch block, and one at the loop exit). For double
5546 reductions we are looking for exit phis of the outer loop. */
5547 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5549 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5551 if (!is_gimple_debug (USE_STMT (use_p)))
5552 phis.safe_push (USE_STMT (use_p));
5554 else
5556 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5558 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5560 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5562 if (!flow_bb_inside_loop_p (loop,
5563 gimple_bb (USE_STMT (phi_use_p)))
5564 && !is_gimple_debug (USE_STMT (phi_use_p)))
5565 phis.safe_push (USE_STMT (phi_use_p));
5571 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5573 /* Replace the uses: */
5574 orig_name = PHI_RESULT (exit_phi);
5575 scalar_result = scalar_results[k];
5576 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5578 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5579 SET_USE (use_p, scalar_result);
5580 update_stmt (use_stmt);
5584 phis.release ();
5588 /* Return a vector of type VECTYPE that is equal to the vector select
5589 operation "MASK ? VEC : IDENTITY". Insert the select statements
5590 before GSI. */
5592 static tree
5593 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5594 tree vec, tree identity)
5596 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5597 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5598 mask, vec, identity);
5599 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5600 return cond;
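/* For a loop mask MASK this emits, roughly,
   cond_N = VEC_COND_EXPR <MASK, VEC, IDENTITY>;
   so that masked-off lanes contribute only IDENTITY to the
   reduction. */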
5603 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5604 order, starting with LHS. Insert the extraction statements before GSI and
5605 associate the new scalar SSA names with variable SCALAR_DEST.
5606 Return the SSA name for the result. */
5608 static tree
5609 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5610 tree_code code, tree lhs, tree vector_rhs)
5612 tree vectype = TREE_TYPE (vector_rhs);
5613 tree scalar_type = TREE_TYPE (vectype);
5614 tree bitsize = TYPE_SIZE (scalar_type);
5615 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5616 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5618 for (unsigned HOST_WIDE_INT bit_offset = 0;
5619 bit_offset < vec_size_in_bits;
5620 bit_offset += element_bitsize)
5622 tree bitpos = bitsize_int (bit_offset);
5623 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5624 bitsize, bitpos);
5626 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5627 rhs = make_ssa_name (scalar_dest, stmt);
5628 gimple_assign_set_lhs (stmt, rhs);
5629 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5631 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5632 tree new_name = make_ssa_name (scalar_dest, stmt);
5633 gimple_assign_set_lhs (stmt, new_name);
5634 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5635 lhs = new_name;
5637 return lhs;
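/* As an illustrative sketch for a four-element VECTOR_RHS v with
   element size SZ bits, this emits
   s0 = BIT_FIELD_REF <v, SZ, 0 * SZ>; lhs = lhs CODE s0;
   s1 = BIT_FIELD_REF <v, SZ, 1 * SZ>; lhs = lhs CODE s1;
   and so on for the remaining elements, returning the final LHS. */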
5640 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5641 type of the vector input. */
5643 static internal_fn
5644 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5646 internal_fn mask_reduc_fn;
5648 switch (reduc_fn)
5650 case IFN_FOLD_LEFT_PLUS:
5651 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5652 break;
5654 default:
5655 return IFN_LAST;
5658 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5659 OPTIMIZE_FOR_SPEED))
5660 return mask_reduc_fn;
5661 return IFN_LAST;
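/* For example, IFN_FOLD_LEFT_PLUS maps to IFN_MASK_FOLD_LEFT_PLUS when
   the target supports the masked variant for VECTYPE_IN; otherwise
   IFN_LAST is returned and the caller falls back to merging the input
   with the identity vector via a VEC_COND_EXPR. */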
5664 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5665 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5666 statement. CODE is the operation performed by STMT_INFO and OPS are
5667 its scalar operands. REDUC_INDEX is the index of the operand in
5668 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5669 implements in-order reduction, or IFN_LAST if we should open-code it.
5670 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5671 that should be used to control the operation in a fully-masked loop. */
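/* As an illustrative sketch, a single-vector in-order reduction of
   res += a[i] with REDUC_FN == IFN_FOLD_LEFT_PLUS becomes roughly
   res_1 = .FOLD_LEFT_PLUS (res_0, vec_a);
   (or .MASK_FOLD_LEFT_PLUS with the loop mask in a fully-masked loop),
   while with REDUC_FN == IFN_LAST it is open-coded by
   vect_expand_fold_left as a chain of scalar extracts and CODE
   operations in element order. */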
5673 static bool
5674 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5675 stmt_vec_info stmt_info,
5676 gimple_stmt_iterator *gsi,
5677 stmt_vec_info *vec_stmt, slp_tree slp_node,
5678 gimple *reduc_def_stmt,
5679 tree_code code, internal_fn reduc_fn,
5680 tree ops[3], tree vectype_in,
5681 int reduc_index, vec_loop_masks *masks)
5683 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5684 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5685 stmt_vec_info new_stmt_info = NULL;
5686 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5688 int ncopies;
5689 if (slp_node)
5690 ncopies = 1;
5691 else
5692 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5694 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5695 gcc_assert (ncopies == 1);
5696 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5698 if (slp_node)
5699 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5700 TYPE_VECTOR_SUBPARTS (vectype_in)));
5702 tree op0 = ops[1 - reduc_index];
5704 int group_size = 1;
5705 stmt_vec_info scalar_dest_def_info;
5706 auto_vec<tree> vec_oprnds0;
5707 if (slp_node)
5709 auto_vec<vec<tree> > vec_defs (2);
5710 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5711 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5712 vec_defs[0].release ();
5713 vec_defs[1].release ();
5714 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5715 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5717 else
5719 tree loop_vec_def0 = vect_get_vec_def_for_operand (loop_vinfo,
5720 op0, stmt_info);
5721 vec_oprnds0.create (1);
5722 vec_oprnds0.quick_push (loop_vec_def0);
5723 scalar_dest_def_info = stmt_info;
5726 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5727 tree scalar_type = TREE_TYPE (scalar_dest);
5728 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5730 int vec_num = vec_oprnds0.length ();
5731 gcc_assert (vec_num == 1 || slp_node);
5732 tree vec_elem_type = TREE_TYPE (vectype_out);
5733 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5735 tree vector_identity = NULL_TREE;
5736 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5737 vector_identity = build_zero_cst (vectype_out);
5739 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5740 int i;
5741 tree def0;
5742 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5744 gimple *new_stmt;
5745 tree mask = NULL_TREE;
5746 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5747 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5749 /* Handle MINUS by adding the negative. */
5750 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5752 tree negated = make_ssa_name (vectype_out);
5753 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5754 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5755 def0 = negated;
5758 if (mask && mask_reduc_fn == IFN_LAST)
5759 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5760 vector_identity);
5762 /* On the first iteration the input is simply the scalar phi
5763 result, and for subsequent iterations it is the output of
5764 the preceding operation. */
5765 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5767 if (mask && mask_reduc_fn != IFN_LAST)
5768 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5769 def0, mask);
5770 else
5771 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5772 def0);
5773 /* For chained SLP reductions the output of the previous reduction
5774 operation serves as the input of the next. For the final statement
5775 the output cannot be a temporary - we reuse the original
5776 scalar destination of the last statement. */
5777 if (i != vec_num - 1)
5779 gimple_set_lhs (new_stmt, scalar_dest_var);
5780 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5781 gimple_set_lhs (new_stmt, reduc_var);
5784 else
5786 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5787 reduc_var, def0);
5788 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5789 /* Remove the statement, so that we can use the same code paths
5790 as for statements that we've just created. */
5791 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5792 gsi_remove (&tmp_gsi, true);
5795 if (i == vec_num - 1)
5797 gimple_set_lhs (new_stmt, scalar_dest);
5798 new_stmt_info = vect_finish_replace_stmt (loop_vinfo,
5799 scalar_dest_def_info,
5800 new_stmt);
5802 else
5803 new_stmt_info = vect_finish_stmt_generation (loop_vinfo,
5804 scalar_dest_def_info,
5805 new_stmt, gsi);
5807 if (slp_node)
5808 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5811 if (!slp_node)
5812 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5814 return true;
5817 /* Function is_nonwrapping_integer_induction.
5819 Check if STMT_VINFO (which is part of loop LOOP) is an integer
5820 induction that increments and does not overflow. */
5822 static bool
5823 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5825 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5826 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5827 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5828 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5829 widest_int ni, max_loop_value, lhs_max;
5830 wi::overflow_type overflow = wi::OVF_NONE;
5832 /* Make sure the loop is integer based. */
5833 if (TREE_CODE (base) != INTEGER_CST
5834 || TREE_CODE (step) != INTEGER_CST)
5835 return false;
5837 /* Check that the max size of the loop will not wrap. */
5839 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5840 return true;
5842 if (! max_stmt_executions (loop, &ni))
5843 return false;
5845 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5846 &overflow);
5847 if (overflow)
5848 return false;
5850 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5851 TYPE_SIGN (lhs_type), &overflow);
5852 if (overflow)
5853 return false;
5855 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5856 <= TYPE_PRECISION (lhs_type));
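/* For example (illustrative), an 8-bit unsigned induction with base 0
   and step 4 in a loop executing at most 50 times reaches at most
   0 + 4 * 50 = 200, which fits in 8 bits, so it is considered
   non-wrapping. */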
5859 /* Check if masking can be supported by inserting a conditional expression.
5860 CODE is the code for the operation. COND_FN is the conditional internal
5861 function, if it exists. VECTYPE_IN is the type of the vector input. */
5862 static bool
5863 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5864 tree vectype_in)
5866 if (cond_fn != IFN_LAST
5867 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5868 OPTIMIZE_FOR_SPEED))
5869 return false;
5871 switch (code)
5873 case DOT_PROD_EXPR:
5874 case SAD_EXPR:
5875 return true;
5877 default:
5878 return false;
5882 /* Insert a conditional expression to enable masked vectorization. CODE is the
5883 code for the operation. VOP is the array of operands. MASK is the loop
5884 mask. GSI is a statement iterator used to place the new conditional
5885 expression. */
5886 static void
5887 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5888 gimple_stmt_iterator *gsi)
5890 switch (code)
5892 case DOT_PROD_EXPR:
5894 tree vectype = TREE_TYPE (vop[1]);
5895 tree zero = build_zero_cst (vectype);
5896 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5897 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5898 mask, vop[1], zero);
5899 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5900 vop[1] = masked_op1;
5901 break;
5904 case SAD_EXPR:
5906 tree vectype = TREE_TYPE (vop[1]);
5907 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5908 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5909 mask, vop[1], vop[0]);
5910 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5911 vop[1] = masked_op1;
5912 break;
5915 default:
5916 gcc_unreachable ();
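/* For example (illustrative), for DOT_PROD_EXPR this emits
   masked_op1 = VEC_COND_EXPR <MASK, vop[1], { 0, ... }>;
   so inactive lanes add 0 to the accumulator, whereas for SAD_EXPR the
   else value is vop[0] so the absolute difference of inactive lanes
   is 0. */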
5920 /* Function vectorizable_reduction.
5922 Check if STMT_INFO performs a reduction operation that can be vectorized.
5923 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5924 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5925 Return true if STMT_INFO is vectorizable in this way.
5927 This function also handles reduction idioms (patterns) that have been
5928 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5929 may be of this form:
5930 X = pattern_expr (arg0, arg1, ..., X)
5931 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5932 sequence that had been detected and replaced by the pattern-stmt
5933 (STMT_INFO).
5935 This function also handles reduction of condition expressions, for example:
5936 for (int i = 0; i < N; i++)
5937 if (a[i] < value)
5938 last = a[i];
5939 This is handled by vectorizing the loop and creating an additional vector
5940 containing the loop indexes for which "a[i] < value" was true. In the
5941 function epilogue this is reduced to a single max value and then used to
5942 index into the vector of results.
5944 In some cases of reduction patterns, the type of the reduction variable X is
5945 different than the type of the other arguments of STMT_INFO.
5946 In such cases, the vectype that is used when transforming STMT_INFO into
5947 a vector stmt is different than the vectype that is used to determine the
5948 vectorization factor, because it consists of a different number of elements
5949 than the actual number of elements that are being operated upon in parallel.
5951 For example, consider an accumulation of shorts into an int accumulator.
5952 On some targets it's possible to vectorize this pattern operating on 8
5953 shorts at a time (hence, the vectype for purposes of determining the
5954 vectorization factor should be V8HI); on the other hand, the vectype that
5955 is used to create the vector form is actually V4SI (the type of the result).
5957 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5958 indicates what is the actual level of parallelism (V8HI in the example), so
5959 that the right vectorization factor would be derived. This vectype
5960 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5961 be used to create the vectorized stmt. The right vectype for the vectorized
5962 stmt is obtained from the type of the result X:
5963 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5965 This means that, contrary to "regular" reductions (or "regular" stmts in
5966 general), the following equation:
5967 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5968 does *NOT* necessarily hold for reduction patterns. */
5970 bool
5971 vectorizable_reduction (loop_vec_info loop_vinfo,
5972 stmt_vec_info stmt_info, slp_tree slp_node,
5973 slp_instance slp_node_instance,
5974 stmt_vector_for_cost *cost_vec)
5976 tree scalar_dest;
5977 tree vectype_in = NULL_TREE;
5978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5979 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5980 stmt_vec_info cond_stmt_vinfo = NULL;
5981 tree scalar_type;
5982 int i;
5983 int ncopies;
5984 bool single_defuse_cycle = false;
5985 bool nested_cycle = false;
5986 bool double_reduc = false;
5987 int vec_num;
5988 tree tem;
5989 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5990 tree cond_reduc_val = NULL_TREE;
5992 /* Make sure it was already recognized as a reduction computation. */
5993 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5994 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5995 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5996 return false;
5998 /* The stmt we store reduction analysis meta on. */
5999 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6000 reduc_info->is_reduc_info = true;
6002 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6004 if (is_a <gphi *> (stmt_info->stmt))
6005 /* Analysis for double-reduction is done on the outer
6006 loop PHI, nested cycles have no further restrictions. */
6007 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6008 else
6009 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6010 return true;
6013 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6014 stmt_vec_info phi_info = stmt_info;
6015 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6018 if (!is_a <gphi *> (stmt_info->stmt))
6020 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6021 return true;
6023 if (slp_node)
6025 slp_node_instance->reduc_phis = slp_node;
6026 /* ??? We're leaving slp_node to point to the PHIs, we only
6027 need it to get at the number of vector stmts which wasn't
6028 yet initialized for the instance root. */
6030 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6031 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6032 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6034 use_operand_p use_p;
6035 gimple *use_stmt;
6036 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6037 &use_p, &use_stmt);
6038 gcc_assert (res);
6039 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6040 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6044 /* PHIs should not participate in patterns. */
6045 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6046 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6048 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6049 and compute the reduction chain length. */
6050 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6051 loop_latch_edge (loop));
6052 unsigned reduc_chain_length = 0;
6053 bool only_slp_reduc_chain = true;
6054 stmt_info = NULL;
6055 while (reduc_def != PHI_RESULT (reduc_def_phi))
6057 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6058 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6059 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6061 if (dump_enabled_p ())
6062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6063 "reduction chain broken by patterns.\n");
6064 return false;
6066 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6067 only_slp_reduc_chain = false;
6068 /* ??? For epilogue generation live members of the chain need
6069 to point back to the PHI via their original stmt for
6070 info_for_reduction to work. */
6071 if (STMT_VINFO_LIVE_P (vdef))
6072 STMT_VINFO_REDUC_DEF (def) = phi_info;
6073 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6074 if (!assign)
6076 if (dump_enabled_p ())
6077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6078 "reduction chain includes calls.\n");
6079 return false;
6081 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6083 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6084 TREE_TYPE (gimple_assign_rhs1 (assign))))
6086 if (dump_enabled_p ())
6087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6088 "conversion in the reduction chain.\n");
6089 return false;
6092 else if (!stmt_info)
6093 /* First non-conversion stmt. */
6094 stmt_info = vdef;
6095 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6096 reduc_chain_length++;
6098 /* PHIs should not participate in patterns. */
6099 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6101 if (nested_in_vect_loop_p (loop, stmt_info))
6103 loop = loop->inner;
6104 nested_cycle = true;
6107 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6108 element. */
6109 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6111 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6112 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6114 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6115 gcc_assert (slp_node
6116 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6118 /* 1. Is vectorizable reduction? */
6119 /* Not supportable if the reduction variable is used in the loop, unless
6120 it's a reduction chain. */
6121 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6122 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6123 return false;
6125 /* Reductions that are not used even in an enclosing outer-loop,
6126 are expected to be "live" (used out of the loop). */
6127 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6128 && !STMT_VINFO_LIVE_P (stmt_info))
6129 return false;
6131 /* 2. Has this been recognized as a reduction pattern?
6133 Check if STMT represents a pattern that has been recognized
6134 in earlier analysis stages. For stmts that represent a pattern,
6135 the STMT_VINFO_RELATED_STMT field records the last stmt in
6136 the original sequence that constitutes the pattern. */
6138 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6139 if (orig_stmt_info)
6141 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6142 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6145 /* 3. Check the operands of the operation. The first operands are defined
6146 inside the loop body. The last operand is the reduction variable,
6147 which is defined by the loop-header-phi. */
6149 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6150 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6151 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6152 enum tree_code code = gimple_assign_rhs_code (stmt);
6153 bool lane_reduc_code_p
6154 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6155 int op_type = TREE_CODE_LENGTH (code);
6157 scalar_dest = gimple_assign_lhs (stmt);
6158 scalar_type = TREE_TYPE (scalar_dest);
6159 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6160 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6161 return false;
6163 /* Do not try to vectorize bit-precision reductions. */
6164 if (!type_has_mode_precision_p (scalar_type))
6165 return false;
6167 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6168 which means the only use of such a PHI may be in the lane-reducing operation. */
6169 if (lane_reduc_code_p
6170 && reduc_chain_length != 1
6171 && !only_slp_reduc_chain)
6173 if (dump_enabled_p ())
6174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6175 "lane-reducing reduction with extra stmts.\n");
6176 return false;
6179 /* All uses but the last are expected to be defined in the loop.
6180 The last use is the reduction variable. In case of nested cycle this
6181 assumption is not true: we use reduc_index to record the index of the
6182 reduction variable. */
6183 reduc_def = PHI_RESULT (reduc_def_phi);
6184 for (i = 0; i < op_type; i++)
6186 tree op = gimple_op (stmt, i + 1);
6187 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6188 if (i == 0 && code == COND_EXPR)
6189 continue;
6191 stmt_vec_info def_stmt_info;
6192 enum vect_def_type dt;
6193 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6194 &def_stmt_info))
6196 if (dump_enabled_p ())
6197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6198 "use not simple.\n");
6199 return false;
6201 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6202 continue;
6204 /* There should be only one cycle def in the stmt, the one
6205 leading to reduc_def. */
6206 if (VECTORIZABLE_CYCLE_DEF (dt))
6207 return false;
6209 /* To properly compute ncopies we are interested in the widest
6210 non-reduction input type in case we're looking at a widening
6211 accumulation that we later handle in vect_transform_reduction. */
6212 if (lane_reduc_code_p
6213 && tem
6214 && (!vectype_in
6215 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6216 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6217 vectype_in = tem;
6219 if (code == COND_EXPR)
6221 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6222 if (dt == vect_constant_def)
6224 cond_reduc_dt = dt;
6225 cond_reduc_val = op;
6227 if (dt == vect_induction_def
6228 && def_stmt_info
6229 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6231 cond_reduc_dt = dt;
6232 cond_stmt_vinfo = def_stmt_info;
6236 if (!vectype_in)
6237 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6238 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6240 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6241 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6242 /* If we have a condition reduction, see if we can simplify it further. */
6243 if (v_reduc_type == COND_REDUCTION)
6245 if (slp_node)
6246 return false;
6248 /* When the condition uses the reduction value in the condition, fail. */
6249 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6251 if (dump_enabled_p ())
6252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253 "condition depends on previous iteration\n");
6254 return false;
6257 if (reduc_chain_length == 1
6258 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6259 vectype_in, OPTIMIZE_FOR_SPEED))
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "optimizing condition reduction with"
6264 " FOLD_EXTRACT_LAST.\n");
6265 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6267 else if (cond_reduc_dt == vect_induction_def)
6269 tree base
6270 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6271 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6273 gcc_assert (TREE_CODE (base) == INTEGER_CST
6274 && TREE_CODE (step) == INTEGER_CST);
6275 cond_reduc_val = NULL_TREE;
6276 enum tree_code cond_reduc_op_code = ERROR_MARK;
6277 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6278 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6280 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6281 above base; punt if base is the minimum value of the type for
6282 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6283 else if (tree_int_cst_sgn (step) == -1)
6285 cond_reduc_op_code = MIN_EXPR;
6286 if (tree_int_cst_sgn (base) == -1)
6287 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6288 else if (tree_int_cst_lt (base,
6289 TYPE_MAX_VALUE (TREE_TYPE (base))))
6290 cond_reduc_val
6291 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6293 else
6295 cond_reduc_op_code = MAX_EXPR;
6296 if (tree_int_cst_sgn (base) == 1)
6297 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6298 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6299 base))
6300 cond_reduc_val
6301 = int_const_binop (MINUS_EXPR, base, integer_one_node);
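/* For example (illustrative), a decreasing induction with base 10 and
   step -1 selects MIN_EXPR with cond_reduc_val 11, a value the
   induction never reaches. */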
6303 if (cond_reduc_val)
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_NOTE, vect_location,
6307 "condition expression based on "
6308 "integer induction.\n");
6309 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6310 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6311 = cond_reduc_val;
6312 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6315 else if (cond_reduc_dt == vect_constant_def)
6317 enum vect_def_type cond_initial_dt;
6318 tree cond_initial_val
6319 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6321 gcc_assert (cond_reduc_val != NULL_TREE);
6322 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6323 if (cond_initial_dt == vect_constant_def
6324 && types_compatible_p (TREE_TYPE (cond_initial_val),
6325 TREE_TYPE (cond_reduc_val)))
6327 tree e = fold_binary (LE_EXPR, boolean_type_node,
6328 cond_initial_val, cond_reduc_val);
6329 if (e && (integer_onep (e) || integer_zerop (e)))
6331 if (dump_enabled_p ())
6332 dump_printf_loc (MSG_NOTE, vect_location,
6333 "condition expression based on "
6334 "compile time constant.\n");
6335 /* Record reduction code at analysis stage. */
6336 STMT_VINFO_REDUC_CODE (reduc_info)
6337 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6338 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6344 if (STMT_VINFO_LIVE_P (phi_info))
6345 return false;
6347 if (slp_node)
6348 ncopies = 1;
6349 else
6350 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6352 gcc_assert (ncopies >= 1);
6354 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6356 if (nested_cycle)
6358 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6359 == vect_double_reduction_def);
6360 double_reduc = true;
6363 /* 4.2. Check support for the epilog operation.
6365 If STMT represents a reduction pattern, then the type of the
6366 reduction variable may be different than the type of the rest
6367 of the arguments. For example, consider the case of accumulation
6368 of shorts into an int accumulator; The original code:
6369 S1: int_a = (int) short_a;
6370 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6372 was replaced with:
6373 STMT: int_acc = widen_sum <short_a, int_acc>
6375 This means that:
6376 1. The tree-code that is used to create the vector operation in the
6377 epilog code (that reduces the partial results) is not the
6378 tree-code of STMT, but is rather the tree-code of the original
6379 stmt from the pattern that STMT is replacing. I.e, in the example
6380 above we want to use 'widen_sum' in the loop, but 'plus' in the
6381 epilog.
6382 2. The type (mode) we use to check available target support
6383 for the vector operation to be created in the *epilog*, is
6384 determined by the type of the reduction variable (in the example
6385 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6386 However the type (mode) we use to check available target support
6387 for the vector operation to be created *inside the loop*, is
6388 determined by the type of the other arguments to STMT (in the
6389 example we'd check this: optab_handler (widen_sum_optab,
6390 vect_short_mode)).
6392 This is contrary to "regular" reductions, in which the types of all
6393 the arguments are the same as the type of the reduction variable.
6394 For "regular" reductions we can therefore use the same vector type
6395 (and also the same tree-code) when generating the epilog code and
6396 when generating the code inside the loop. */
6398 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6399 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6401 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6402 if (reduction_type == TREE_CODE_REDUCTION)
6404 /* Check whether it's ok to change the order of the computation.
6405 Generally, when vectorizing a reduction we change the order of the
6406 computation. This may change the behavior of the program in some
6407 cases, so we need to check that this is ok. One exception is when
6408 vectorizing an outer-loop: the inner-loop is executed sequentially,
6409 and therefore vectorizing reductions in the inner-loop during
6410 outer-loop vectorization is safe. */
6411 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6413 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6414 is not directly used in stmt. */
6415 if (!only_slp_reduc_chain
6416 && reduc_chain_length != 1)
6418 if (dump_enabled_p ())
6419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6420 "in-order reduction chain without SLP.\n");
6421 return false;
6423 STMT_VINFO_REDUC_TYPE (reduc_info)
6424 = reduction_type = FOLD_LEFT_REDUCTION;
6426 else if (!commutative_tree_code (orig_code)
6427 || !associative_tree_code (orig_code))
6429 if (dump_enabled_p ())
6430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6431 "reduction: not commutative/associative");
6432 return false;
6436 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6437 && ncopies > 1)
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6441 "multiple types in double reduction or condition "
6442 "reduction or fold-left reduction.\n");
6443 return false;
6446 internal_fn reduc_fn = IFN_LAST;
6447 if (reduction_type == TREE_CODE_REDUCTION
6448 || reduction_type == FOLD_LEFT_REDUCTION
6449 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6450 || reduction_type == CONST_COND_REDUCTION)
6452 if (reduction_type == FOLD_LEFT_REDUCTION
6453 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6454 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6456 if (reduc_fn != IFN_LAST
6457 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6458 OPTIMIZE_FOR_SPEED))
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6462 "reduc op not supported by target.\n");
6464 reduc_fn = IFN_LAST;
6467 else
6469 if (!nested_cycle || double_reduc)
6471 if (dump_enabled_p ())
6472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6473 "no reduc code for scalar code.\n");
6475 return false;
6479 else if (reduction_type == COND_REDUCTION)
6481 int scalar_precision
6482 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6483 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6484 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6485 nunits_out);
6487 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6488 OPTIMIZE_FOR_SPEED))
6489 reduc_fn = IFN_REDUC_MAX;
6491 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6493 if (reduction_type != EXTRACT_LAST_REDUCTION
6494 && (!nested_cycle || double_reduc)
6495 && reduc_fn == IFN_LAST
6496 && !nunits_out.is_constant ())
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6500 "missing target support for reduction on"
6501 " variable-length vectors.\n");
6502 return false;
6505 /* For SLP reductions, see if there is a neutral value we can use. */
6506 tree neutral_op = NULL_TREE;
6507 if (slp_node)
6508 neutral_op = neutral_op_for_slp_reduction
6509 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6510 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6512 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6514 /* We can't support in-order reductions of code such as this:
6516 for (int i = 0; i < n1; ++i)
6517 for (int j = 0; j < n2; ++j)
6518 l += a[j];
6520 since GCC effectively transforms the loop when vectorizing:
6522 for (int i = 0; i < n1 / VF; ++i)
6523 for (int j = 0; j < n2; ++j)
6524 for (int k = 0; k < VF; ++k)
6525 l += a[j];
6527 which is a reassociation of the original operation. */
6528 if (dump_enabled_p ())
6529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6530 "in-order double reduction not supported.\n");
6532 return false;
6535 if (reduction_type == FOLD_LEFT_REDUCTION
6536 && slp_node
6537 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6539 /* We cannot use in-order reductions in this case because there is
6540 an implicit reassociation of the operations involved. */
6541 if (dump_enabled_p ())
6542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6543 "in-order unchained SLP reductions not supported.\n");
6544 return false;
6547 /* For double reductions, and for SLP reductions with a neutral value,
6548 we construct a variable-length initial vector by loading a vector
6549 full of the neutral value and then shift-and-inserting the start
6550 values into the low-numbered elements. */
6551 if ((double_reduc || neutral_op)
6552 && !nunits_out.is_constant ()
6553 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6554 vectype_out, OPTIMIZE_FOR_SPEED))
6556 if (dump_enabled_p ())
6557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6558 "reduction on variable-length vectors requires"
6559 " target support for a vector-shift-and-insert"
6560 " operation.\n");
6561 return false;
6564 /* Check extra constraints for variable-length unchained SLP reductions. */
6565 if (STMT_SLP_TYPE (stmt_info)
6566 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6567 && !nunits_out.is_constant ())
6569 /* We checked above that we could build the initial vector when
6570 there's a neutral element value. Check here for the case in
6571 which each SLP statement has its own initial value and in which
6572 that value needs to be repeated for every instance of the
6573 statement within the initial vector. */
6574 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6575 if (!neutral_op
6576 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6577 TREE_TYPE (vectype_out)))
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "unsupported form of SLP reduction for"
6582 " variable-length vectors: cannot build"
6583 " initial vector.\n");
6584 return false;
6586 /* The epilogue code relies on the number of elements being a multiple
6587 of the group size. The duplicate-and-interleave approach to setting
6588 up the initial vector does too. */
6589 if (!multiple_p (nunits_out, group_size))
6591 if (dump_enabled_p ())
6592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6593 "unsupported form of SLP reduction for"
6594 " variable-length vectors: the vector size"
6595 " is not a multiple of the number of results.\n");
6596 return false;
6600 if (reduction_type == COND_REDUCTION)
6602 widest_int ni;
6604 if (! max_loop_iterations (loop, &ni))
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_NOTE, vect_location,
6608 "loop count not known, cannot create cond "
6609 "reduction.\n");
6610 return false;
6612 /* Convert backedges to iterations. */
6613 ni += 1;
6615 /* The additional index will be the same type as the condition. Check
6616 that the loop can fit into this less one (because we'll use up the
6617 zero slot for when there are no matches). */
6618 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6619 if (wi::geu_p (ni, wi::to_widest (max_index)))
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_NOTE, vect_location,
6623 "loop size is greater than data size.\n");
6624 return false;
6628 /* In case the vectorization factor (VF) is bigger than the number
6629 of elements that we can fit in a vectype (nunits), we have to generate
6630 more than one vector stmt - i.e., we need to "unroll" the
6631 vector stmt by a factor VF/nunits. For more details see documentation
6632 in vectorizable_operation. */
6634 /* If the reduction is used in an outer loop we need to generate
6635 VF intermediate results, like so (e.g. for ncopies=2):
6636 r0 = phi (init, r0)
6637 r1 = phi (init, r1)
6638 r0 = x0 + r0;
6639 r1 = x1 + r1;
6640 (i.e. we generate VF results in 2 registers).
6641 In this case we have a separate def-use cycle for each copy, and therefore
6642 for each copy we get the vector def for the reduction variable from the
6643 respective phi node created for this copy.
6645 Otherwise (the reduction is unused in the loop nest), we can combine
6646 together intermediate results, like so (e.g. for ncopies=2):
6647 r = phi (init, r)
6648 r = x0 + r;
6649 r = x1 + r;
6650 (i.e. we generate VF/2 results in a single register).
6651 In this case for each copy we get the vector def for the reduction variable
6652 from the vectorized reduction operation generated in the previous iteration.
6654 This only works when we see both the reduction PHI and its only consumer
6655 in vectorizable_reduction and there are no intermediate stmts
6656 participating. */
6657 if (ncopies > 1
6658 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6659 && reduc_chain_length == 1)
6660 single_defuse_cycle = true;
6662 if (single_defuse_cycle || lane_reduc_code_p)
6664 gcc_assert (code != COND_EXPR);
6666 /* 4. Supportable by target? */
6667 bool ok = true;
6669 /* 4.1. check support for the operation in the loop */
6670 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6671 if (!optab)
6673 if (dump_enabled_p ())
6674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6675 "no optab.\n");
6676 ok = false;
6679 machine_mode vec_mode = TYPE_MODE (vectype_in);
6680 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6682 if (dump_enabled_p ())
6683 dump_printf (MSG_NOTE, "op not supported by target.\n");
6684 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6685 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6686 ok = false;
6687 else
6688 if (dump_enabled_p ())
6689 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6692 /* Worthwhile without SIMD support? */
6693 if (ok
6694 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6695 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6697 if (dump_enabled_p ())
6698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6699 "not worthwhile without SIMD support.\n");
6700 ok = false;
6703 /* lane-reducing operations have to go through vect_transform_reduction.
6704 For the other cases try without the single cycle optimization. */
6705 if (!ok)
6707 if (lane_reduc_code_p)
6708 return false;
6709 else
6710 single_defuse_cycle = false;
6713 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6715 /* If the reduction stmt is one of the patterns that have lane
6716 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6717 if ((ncopies > 1 && ! single_defuse_cycle)
6718 && lane_reduc_code_p)
6720 if (dump_enabled_p ())
6721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6722 "multi def-use cycle not possible for lane-reducing "
6723 "reduction operation\n");
6724 return false;
6727 if (slp_node)
6728 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6729 else
6730 vec_num = 1;
6732 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
6733 reduction_type, ncopies, cost_vec);
6734 if (dump_enabled_p ()
6735 && reduction_type == FOLD_LEFT_REDUCTION)
6736 dump_printf_loc (MSG_NOTE, vect_location,
6737 "using an in-order (fold-left) reduction.\n");
6738 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6739 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6740 reductions go through their own vectorizable_* routines. */
6741 if (!single_defuse_cycle
6742 && code != DOT_PROD_EXPR
6743 && code != WIDEN_SUM_EXPR
6744 && code != SAD_EXPR
6745 && reduction_type != FOLD_LEFT_REDUCTION)
6747 stmt_vec_info tem
6748 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6749 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6751 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6752 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6754 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6755 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6757 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6759 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6760 internal_fn cond_fn = get_conditional_internal_fn (code);
6762 if (reduction_type != FOLD_LEFT_REDUCTION
6763 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6764 && (cond_fn == IFN_LAST
6765 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6766 OPTIMIZE_FOR_SPEED)))
6768 if (dump_enabled_p ())
6769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6770 "can't use a fully-masked loop because no"
6771 " conditional operation is available.\n");
6772 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6774 else if (reduction_type == FOLD_LEFT_REDUCTION
6775 && reduc_fn == IFN_LAST
6776 && !expand_vec_cond_expr_p (vectype_in,
6777 truth_type_for (vectype_in),
6778 SSA_NAME))
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6782 "can't use a fully-masked loop because no"
6783 " conditional operation is available.\n");
6784 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6786 else
6787 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6788 vectype_in, NULL);
6790 return true;
6793 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6794 value. */
6796 bool
6797 vect_transform_reduction (loop_vec_info loop_vinfo,
6798 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6799 stmt_vec_info *vec_stmt, slp_tree slp_node)
6801 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6802 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6803 int i;
6804 int ncopies;
6805 int j;
6806 int vec_num;
6808 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6809 gcc_assert (reduc_info->is_reduc_info);
6811 if (nested_in_vect_loop_p (loop, stmt_info))
6813 loop = loop->inner;
6814 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6817 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6818 enum tree_code code = gimple_assign_rhs_code (stmt);
6819 int op_type = TREE_CODE_LENGTH (code);
6821 /* Flatten RHS. */
6822 tree ops[3];
6823 switch (get_gimple_rhs_class (code))
6825 case GIMPLE_TERNARY_RHS:
6826 ops[2] = gimple_assign_rhs3 (stmt);
6827 /* Fall thru. */
6828 case GIMPLE_BINARY_RHS:
6829 ops[0] = gimple_assign_rhs1 (stmt);
6830 ops[1] = gimple_assign_rhs2 (stmt);
6831 break;
6832 default:
6833 gcc_unreachable ();
6836 /* All uses but the last are expected to be defined in the loop.
6837 The last use is the reduction variable. In case of nested cycle this
6838 assumption is not true: we use reduc_index to record the index of the
6839 reduction variable. */
6840 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6841 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6842 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6843 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6845 if (slp_node)
6847 ncopies = 1;
6848 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6850 else
6852 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6853 vec_num = 1;
6856 internal_fn cond_fn = get_conditional_internal_fn (code);
6857 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6858 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6860 /* Transform. */
6861 stmt_vec_info new_stmt_info = NULL;
6862 stmt_vec_info prev_stmt_info;
6863 tree new_temp = NULL_TREE;
6864 auto_vec<tree> vec_oprnds0;
6865 auto_vec<tree> vec_oprnds1;
6866 auto_vec<tree> vec_oprnds2;
6867 tree def0;
6869 if (dump_enabled_p ())
6870 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6872 /* FORNOW: Multiple types are not supported for condition. */
6873 if (code == COND_EXPR)
6874 gcc_assert (ncopies == 1);
6876 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6878 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6879 if (reduction_type == FOLD_LEFT_REDUCTION)
6881 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6882 return vectorize_fold_left_reduction
6883 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6884 reduc_fn, ops, vectype_in, reduc_index, masks);
6887 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6888 gcc_assert (single_defuse_cycle
6889 || code == DOT_PROD_EXPR
6890 || code == WIDEN_SUM_EXPR
6891 || code == SAD_EXPR);
6893 /* Create the destination vector */
6894 tree scalar_dest = gimple_assign_lhs (stmt);
6895 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6897 prev_stmt_info = NULL;
6898 if (!slp_node)
6900 vec_oprnds0.create (1);
6901 vec_oprnds1.create (1);
6902 if (op_type == ternary_op)
6903 vec_oprnds2.create (1);
6906 for (j = 0; j < ncopies; j++)
6908 /* Handle uses. */
6909 if (j == 0)
6911 if (slp_node)
6913 /* Get vec defs for all the operands except the reduction index,
6914 ensuring the ordering of the ops in the vector is kept. */
6915 auto_vec<vec<tree>, 3> vec_defs;
6916 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6917 vec_oprnds0.safe_splice (vec_defs[0]);
6918 vec_defs[0].release ();
6919 vec_oprnds1.safe_splice (vec_defs[1]);
6920 vec_defs[1].release ();
6921 if (op_type == ternary_op)
6923 vec_oprnds2.safe_splice (vec_defs[2]);
6924 vec_defs[2].release ();
6927 else
6929 vec_oprnds0.quick_push
6930 (vect_get_vec_def_for_operand (loop_vinfo, ops[0], stmt_info));
6931 vec_oprnds1.quick_push
6932 (vect_get_vec_def_for_operand (loop_vinfo, ops[1], stmt_info));
6933 if (op_type == ternary_op)
6934 vec_oprnds2.quick_push
6935 (vect_get_vec_def_for_operand (loop_vinfo, ops[2], stmt_info));
6938 else
6940 if (!slp_node)
6942 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6944 if (single_defuse_cycle && reduc_index == 0)
6945 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6946 else
6947 vec_oprnds0[0]
6948 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6949 vec_oprnds0[0]);
6950 if (single_defuse_cycle && reduc_index == 1)
6951 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6952 else
6953 vec_oprnds1[0]
6954 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6955 vec_oprnds1[0]);
6956 if (op_type == ternary_op)
6958 if (single_defuse_cycle && reduc_index == 2)
6959 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6960 else
6961 vec_oprnds2[0]
6962 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6963 vec_oprnds2[0]);
6968 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6970 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6971 if (masked_loop_p && !mask_by_cond_expr)
6973 /* Make sure that the reduction accumulator is vop[0]. */
6974 if (reduc_index == 1)
6976 gcc_assert (commutative_tree_code (code));
6977 std::swap (vop[0], vop[1]);
6979 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6980 vectype_in, i * ncopies + j);
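/* Sketch of the conditional reduction step built just below, assuming the
   usual IFN_COND_* operand order (mask, op0, op1, else-value): per lane,
     new_acc[i] = mask[i] ? vop[0][i] OP vop[1][i] : vop[0][i];
   so lanes that are masked off simply keep the accumulator value.  */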
6981 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6982 vop[0], vop[1],
6983 vop[0]);
6984 new_temp = make_ssa_name (vec_dest, call);
6985 gimple_call_set_lhs (call, new_temp);
6986 gimple_call_set_nothrow (call, true);
6987 new_stmt_info
6988 = vect_finish_stmt_generation (loop_vinfo,
6989 stmt_info, call, gsi);
6991 else
6993 if (op_type == ternary_op)
6994 vop[2] = vec_oprnds2[i];
6996 if (masked_loop_p && mask_by_cond_expr)
6998 tree mask = vect_get_loop_mask (gsi, masks,
6999 vec_num * ncopies,
7000 vectype_in, i * ncopies + j);
7001 build_vect_cond_expr (code, vop, mask, gsi);
7004 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7005 vop[0], vop[1], vop[2]);
7006 new_temp = make_ssa_name (vec_dest, new_stmt);
7007 gimple_assign_set_lhs (new_stmt, new_temp);
7008 new_stmt_info
7009 = vect_finish_stmt_generation (loop_vinfo,
7010 stmt_info, new_stmt, gsi);
7013 if (slp_node)
7014 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7017 if (slp_node || single_defuse_cycle)
7018 continue;
7020 if (j == 0)
7021 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7022 else
7023 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7025 prev_stmt_info = new_stmt_info;
7028 if (single_defuse_cycle && !slp_node)
7029 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7031 return true;
7034 /* Transform phase of a cycle PHI. */
7036 bool
7037 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7038 stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7039 slp_tree slp_node, slp_instance slp_node_instance)
7041 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7042 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7043 int i;
7044 int ncopies;
7045 stmt_vec_info prev_phi_info;
7046 int j;
7047 bool nested_cycle = false;
7048 int vec_num;
7050 if (nested_in_vect_loop_p (loop, stmt_info))
7052 loop = loop->inner;
7053 nested_cycle = true;
7056 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7057 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7058 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7059 gcc_assert (reduc_info->is_reduc_info);
7061 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7062 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7063 /* Leave the scalar phi in place. */
7064 return true;
7066 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7067 /* For a nested cycle we do not fill the above. */
7068 if (!vectype_in)
7069 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7070 gcc_assert (vectype_in);
7072 if (slp_node)
7074 /* The size vect_schedule_slp_instance computes is off for us. */
7075 vec_num = vect_get_num_vectors
7076 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7077 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7078 ncopies = 1;
7080 else
7082 vec_num = 1;
7083 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7086 /* Check whether we should use a single PHI node and accumulate
7087 vectors to one before the backedge. */
7088 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7089 ncopies = 1;
7091 /* Create the destination vector */
7092 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7093 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7094 vectype_out);
7096 /* Get the loop-entry arguments. */
7097 tree vec_initial_def;
7098 auto_vec<tree> vec_initial_defs;
7099 if (slp_node)
7101 vec_initial_defs.reserve (vec_num);
7102 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7103 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7104 tree neutral_op
7105 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7106 STMT_VINFO_REDUC_CODE (reduc_info),
7107 first != NULL);
7108 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7109 &vec_initial_defs, vec_num,
7110 first != NULL, neutral_op);
7112 else
7114 /* Get at the scalar def before the loop, that defines the initial
7115 value of the reduction variable. */
7116 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7117 loop_preheader_edge (loop));
7118 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7119 and we can't use zero for induc_val, use initial_def. Similarly
7120 for REDUC_MIN and initial_def larger than the base. */
7121 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7123 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7124 if (TREE_CODE (initial_def) == INTEGER_CST
7125 && !integer_zerop (induc_val)
7126 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7127 && tree_int_cst_lt (initial_def, induc_val))
7128 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7129 && tree_int_cst_lt (induc_val, initial_def))))
7131 induc_val = initial_def;
7132 /* Communicate that we used the initial_def to epilogue
7133 generation. */
7134 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7136 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7138 else if (nested_cycle)
7140 /* Do not use an adjustment def as that case is not supported
7141 correctly if ncopies is not one. */
7142 vec_initial_def = vect_get_vec_def_for_operand (loop_vinfo,
7143 initial_def,
7144 reduc_stmt_info);
7146 else
7148 tree adjustment_def = NULL_TREE;
7149 tree *adjustment_defp = &adjustment_def;
7150 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7151 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7152 adjustment_defp = NULL;
7153 vec_initial_def
7154 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7155 initial_def, adjustment_defp);
7156 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7158 vec_initial_defs.create (1);
7159 vec_initial_defs.quick_push (vec_initial_def);
7162 /* Generate the reduction PHIs upfront. */
7163 prev_phi_info = NULL;
7164 for (i = 0; i < vec_num; i++)
7166 tree vec_init_def = vec_initial_defs[i];
7167 for (j = 0; j < ncopies; j++)
7169 /* Create the reduction-phi that defines the reduction
7170 operand. */
7171 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7172 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7174 /* Set the loop-entry arg of the reduction-phi. */
7175 if (j != 0 && nested_cycle)
7176 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7177 vec_init_def);
7178 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7179 UNKNOWN_LOCATION);
7181 /* The loop-latch arg is set in epilogue processing. */
7183 if (slp_node)
7184 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7185 else
7187 if (j == 0)
7188 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7189 else
7190 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7191 prev_phi_info = new_phi_info;
7196 return true;
7199 /* Vectorizes LC PHIs. */
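/* Illustrative shape (assumed, not taken from a testcase): a loop-closed
   PHI is a single-argument PHI in the block following the loop exit, e.g.
     x_4 = PHI <x_3(E)>
   where E is the single edge leaving the loop; it is replaced below by an
   equivalent single-argument PHI over the vectorized definition.  */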
7201 bool
7202 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7203 stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7204 slp_tree slp_node)
7206 if (!loop_vinfo
7207 || !is_a <gphi *> (stmt_info->stmt)
7208 || gimple_phi_num_args (stmt_info->stmt) != 1)
7209 return false;
7211 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7212 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7213 return false;
7215 if (!vec_stmt) /* transformation not required. */
7217 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7218 return true;
7221 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7222 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7223 basic_block bb = gimple_bb (stmt_info->stmt);
7224 edge e = single_pred_edge (bb);
7225 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7226 vec<tree> vec_oprnds = vNULL;
7227 vect_get_vec_defs (loop_vinfo,
7228 gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7229 stmt_info, &vec_oprnds, NULL, slp_node);
7230 if (slp_node)
7232 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7233 gcc_assert (vec_oprnds.length () == vec_num);
7234 for (unsigned i = 0; i < vec_num; i++)
7236 /* Create the vectorized LC PHI node. */
7237 gphi *new_phi = create_phi_node (vec_dest, bb);
7238 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7239 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7240 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7243 else
7245 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7246 stmt_vec_info prev_phi_info = NULL;
7247 for (unsigned i = 0; i < ncopies; i++)
7249 if (i != 0)
7250 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7251 /* Create the vectorized LC PHI node. */
7252 gphi *new_phi = create_phi_node (vec_dest, bb);
7253 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7254 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7255 if (i == 0)
7256 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7257 else
7258 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7259 prev_phi_info = new_phi_info;
7262 vec_oprnds.release ();
7264 return true;
7268 /* Function vect_min_worthwhile_factor.
7270 For a loop where we could vectorize the operation indicated by CODE,
7271 return the minimum vectorization factor that makes it worthwhile
7272 to use generic vectors. */
7273 static unsigned int
7274 vect_min_worthwhile_factor (enum tree_code code)
7276 switch (code)
7278 case PLUS_EXPR:
7279 case MINUS_EXPR:
7280 case NEGATE_EXPR:
7281 return 4;
7283 case BIT_AND_EXPR:
7284 case BIT_IOR_EXPR:
7285 case BIT_XOR_EXPR:
7286 case BIT_NOT_EXPR:
7287 return 2;
7289 default:
7290 return INT_MAX;
7294 /* Return true if VINFO indicates we are doing loop vectorization and if
7295 it is worth decomposing CODE operations into scalar operations for
7296 that loop's vectorization factor. */
7298 bool
7299 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7301 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7302 unsigned HOST_WIDE_INT value;
7303 return (loop_vinfo
7304 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7305 && value >= vect_min_worthwhile_factor (code));
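/* Worked example (illustrative only): with a compile-time vectorization
   factor of 4, PLUS_EXPR is considered worthwhile (4 >= 4) whereas with a
   factor of 2 it is not (2 < 4); bitwise codes such as BIT_AND_EXPR are
   already worthwhile at factor 2.  */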
7308 /* Function vectorizable_induction
7310 Check if STMT_INFO performs an induction computation that can be vectorized.
7311 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7312 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7313 Return true if STMT_INFO is vectorizable in this way. */
7315 bool
7316 vectorizable_induction (loop_vec_info loop_vinfo,
7317 stmt_vec_info stmt_info,
7318 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7319 stmt_vec_info *vec_stmt, slp_tree slp_node,
7320 stmt_vector_for_cost *cost_vec)
7322 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7323 unsigned ncopies;
7324 bool nested_in_vect_loop = false;
7325 class loop *iv_loop;
7326 tree vec_def;
7327 edge pe = loop_preheader_edge (loop);
7328 basic_block new_bb;
7329 tree new_vec, vec_init, vec_step, t;
7330 tree new_name;
7331 gimple *new_stmt;
7332 gphi *induction_phi;
7333 tree induc_def, vec_dest;
7334 tree init_expr, step_expr;
7335 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7336 unsigned i;
7337 tree expr;
7338 gimple_seq stmts;
7339 imm_use_iterator imm_iter;
7340 use_operand_p use_p;
7341 gimple *exit_phi;
7342 edge latch_e;
7343 tree loop_arg;
7344 gimple_stmt_iterator si;
7346 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7347 if (!phi)
7348 return false;
7350 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7351 return false;
7353 /* Make sure it was recognized as induction computation. */
7354 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7355 return false;
7357 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7358 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7360 if (slp_node)
7361 ncopies = 1;
7362 else
7363 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7364 gcc_assert (ncopies >= 1);
7366 /* FORNOW. These restrictions should be relaxed. */
7367 if (nested_in_vect_loop_p (loop, stmt_info))
7369 imm_use_iterator imm_iter;
7370 use_operand_p use_p;
7371 gimple *exit_phi;
7372 edge latch_e;
7373 tree loop_arg;
7375 if (ncopies > 1)
7377 if (dump_enabled_p ())
7378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7379 "multiple types in nested loop.\n");
7380 return false;
7383 /* FORNOW: outer loop induction with SLP not supported. */
7384 if (STMT_SLP_TYPE (stmt_info))
7385 return false;
7387 exit_phi = NULL;
7388 latch_e = loop_latch_edge (loop->inner);
7389 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7390 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7392 gimple *use_stmt = USE_STMT (use_p);
7393 if (is_gimple_debug (use_stmt))
7394 continue;
7396 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7398 exit_phi = use_stmt;
7399 break;
7402 if (exit_phi)
7404 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7405 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7406 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7408 if (dump_enabled_p ())
7409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7410 "inner-loop induction only used outside "
7411 "of the outer vectorized loop.\n");
7412 return false;
7416 nested_in_vect_loop = true;
7417 iv_loop = loop->inner;
7419 else
7420 iv_loop = loop;
7421 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7423 if (slp_node && !nunits.is_constant ())
7425 /* The current SLP code creates the initial value element-by-element. */
7426 if (dump_enabled_p ())
7427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7428 "SLP induction not supported for variable-length"
7429 " vectors.\n");
7430 return false;
7433 if (!vec_stmt) /* transformation not required. */
7435 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7436 DUMP_VECT_SCOPE ("vectorizable_induction");
7437 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7438 return true;
7441 /* Transform. */
7443 /* Compute a vector variable, initialized with the first VF values of
7444 the induction variable. E.g., for an iv with IV_PHI='X' and
7445 evolution S, for a vector of 4 units, we want to compute:
7446 [X, X + S, X + 2*S, X + 3*S]. */
7448 if (dump_enabled_p ())
7449 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7451 latch_e = loop_latch_edge (iv_loop);
7452 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7454 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7455 gcc_assert (step_expr != NULL_TREE);
7456 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7458 pe = loop_preheader_edge (iv_loop);
7459 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7460 loop_preheader_edge (iv_loop));
7462 stmts = NULL;
7463 if (!nested_in_vect_loop)
7465 /* Convert the initial value to the IV update type. */
7466 tree new_type = TREE_TYPE (step_expr);
7467 init_expr = gimple_convert (&stmts, new_type, init_expr);
7469 /* If we are using the loop mask to "peel" for alignment then we need
7470 to adjust the start value here. */
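/* For instance (illustrative), if the first 3 iterations are "peeled" via
   the loop mask and the IV has start value X and step S, the vector IV must
   be built from X - 3*S so that its first active lane (lane 3) still
   evaluates to X.  */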
7471 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7472 if (skip_niters != NULL_TREE)
7474 if (FLOAT_TYPE_P (vectype))
7475 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7476 skip_niters);
7477 else
7478 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7479 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7480 skip_niters, step_expr);
7481 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7482 init_expr, skip_step);
7486 if (stmts)
7488 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7489 gcc_assert (!new_bb);
7492 /* Find the first insertion point in the BB. */
7493 basic_block bb = gimple_bb (phi);
7494 si = gsi_after_labels (bb);
7496 /* For SLP induction we have to generate several IVs as for example
7497 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7498 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7499 [VF*S, VF*S, VF*S, VF*S] for all. */
7500 if (slp_node)
7502 /* Enforced above. */
7503 unsigned int const_nunits = nunits.to_constant ();
7505 /* Generate [VF*S, VF*S, ... ]. */
7506 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7508 expr = build_int_cst (integer_type_node, vf);
7509 expr = fold_convert (TREE_TYPE (step_expr), expr);
7511 else
7512 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7513 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7514 expr, step_expr);
7515 if (! CONSTANT_CLASS_P (new_name))
7516 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7517 TREE_TYPE (step_expr), NULL);
7518 new_vec = build_vector_from_val (step_vectype, new_name);
7519 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7520 new_vec, step_vectype, NULL);
7522 /* Now generate the IVs. */
7523 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7524 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7525 unsigned elts = const_nunits * nvects;
7526 unsigned nivs = least_common_multiple (group_size,
7527 const_nunits) / const_nunits;
7528 gcc_assert (elts % group_size == 0);
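/* Continuing the group-size-3 example above with const_nunits == 4 and
   nvects == 6 (numbers assumed for illustration): elts = 24,
   nivs = lcm (3, 4) / 4 = 3, so three distinct IV vectors are built and
   24 % 3 == 0 as asserted.  */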
7529 tree elt = init_expr;
7530 unsigned ivn;
7531 for (ivn = 0; ivn < nivs; ++ivn)
7533 tree_vector_builder elts (step_vectype, const_nunits, 1);
7534 stmts = NULL;
7535 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7537 if (ivn*const_nunits + eltn >= group_size
7538 && (ivn * const_nunits + eltn) % group_size == 0)
7539 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7540 elt, step_expr);
7541 elts.quick_push (elt);
7543 vec_init = gimple_build_vector (&stmts, &elts);
7544 vec_init = gimple_convert (&stmts, vectype, vec_init);
7545 if (stmts)
7547 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7548 gcc_assert (!new_bb);
7551 /* Create the induction-phi that defines the induction-operand. */
7552 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7553 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7554 stmt_vec_info induction_phi_info
7555 = loop_vinfo->add_stmt (induction_phi);
7556 induc_def = PHI_RESULT (induction_phi);
7558 /* Create the iv update inside the loop */
7559 gimple_seq stmts = NULL;
7560 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7561 vec_def = gimple_build (&stmts,
7562 PLUS_EXPR, step_vectype, vec_def, vec_step);
7563 vec_def = gimple_convert (&stmts, vectype, vec_def);
7564 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7565 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7567 /* Set the arguments of the phi node: */
7568 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7569 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7570 UNKNOWN_LOCATION);
7572 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7575 /* Re-use IVs when we can. */
7576 if (ivn < nvects)
7578 unsigned vfp
7579 = least_common_multiple (group_size, const_nunits) / group_size;
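/* In the same illustrative example (group_size == 3, const_nunits == 4):
   vfp = lcm (3, 4) / 3 = 4, i.e. each re-used IV is the IV created nivs
   positions earlier advanced by 4*S.  */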
7580 /* Generate [VF'*S, VF'*S, ... ]. */
7581 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7583 expr = build_int_cst (integer_type_node, vfp);
7584 expr = fold_convert (TREE_TYPE (step_expr), expr);
7586 else
7587 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7588 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7589 expr, step_expr);
7590 if (! CONSTANT_CLASS_P (new_name))
7591 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7592 TREE_TYPE (step_expr), NULL);
7593 new_vec = build_vector_from_val (step_vectype, new_name);
7594 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7595 step_vectype, NULL);
7596 for (; ivn < nvects; ++ivn)
7598 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7599 tree def;
7600 if (gimple_code (iv) == GIMPLE_PHI)
7601 def = gimple_phi_result (iv);
7602 else
7603 def = gimple_assign_lhs (iv);
7604 gimple_seq stmts = NULL;
7605 def = gimple_convert (&stmts, step_vectype, def);
7606 def = gimple_build (&stmts,
7607 PLUS_EXPR, step_vectype, def, vec_step);
7608 def = gimple_convert (&stmts, vectype, def);
7609 if (gimple_code (iv) == GIMPLE_PHI)
7610 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7611 else
7613 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7614 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7616 SLP_TREE_VEC_STMTS (slp_node).quick_push
7617 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7621 return true;
7624 /* Create the vector that holds the initial_value of the induction. */
7625 if (nested_in_vect_loop)
7627 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7628 been created during vectorization of previous stmts. We obtain it
7629 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7630 vec_init = vect_get_vec_def_for_operand (loop_vinfo,
7631 init_expr, stmt_info);
7632 /* If the initial value is not of proper type, convert it. */
7633 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7635 new_stmt
7636 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7637 vect_simple_var,
7638 "vec_iv_"),
7639 VIEW_CONVERT_EXPR,
7640 build1 (VIEW_CONVERT_EXPR, vectype,
7641 vec_init));
7642 vec_init = gimple_assign_lhs (new_stmt);
7643 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7644 new_stmt);
7645 gcc_assert (!new_bb);
7646 loop_vinfo->add_stmt (new_stmt);
7649 else
7651 /* iv_loop is the loop to be vectorized. Create:
7652 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7653 stmts = NULL;
7654 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7656 unsigned HOST_WIDE_INT const_nunits;
7657 if (nunits.is_constant (&const_nunits))
7659 tree_vector_builder elts (step_vectype, const_nunits, 1);
7660 elts.quick_push (new_name);
7661 for (i = 1; i < const_nunits; i++)
7663 /* Create: new_name_i = new_name + step_expr */
7664 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7665 new_name, step_expr);
7666 elts.quick_push (new_name);
7668 /* Create a vector from [new_name_0, new_name_1, ...,
7669 new_name_nunits-1] */
7670 vec_init = gimple_build_vector (&stmts, &elts);
7672 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7673 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7674 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7675 new_name, step_expr);
7676 else
7678 /* Build:
7679 [base, base, base, ...]
7680 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7681 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7682 gcc_assert (flag_associative_math);
7683 tree index = build_index_vector (step_vectype, 0, 1);
7684 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7685 new_name);
7686 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7687 step_expr);
7688 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7689 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7690 vec_init, step_vec);
7691 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7692 vec_init, base_vec);
7694 vec_init = gimple_convert (&stmts, vectype, vec_init);
7696 if (stmts)
7698 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7699 gcc_assert (!new_bb);
7704 /* Create the vector that holds the step of the induction. */
7705 if (nested_in_vect_loop)
7706 /* iv_loop is nested in the loop to be vectorized. Generate:
7707 vec_step = [S, S, S, S] */
7708 new_name = step_expr;
7709 else
7711 /* iv_loop is the loop to be vectorized. Generate:
7712 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7713 gimple_seq seq = NULL;
7714 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7716 expr = build_int_cst (integer_type_node, vf);
7717 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7719 else
7720 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7721 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7722 expr, step_expr);
7723 if (seq)
7725 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7726 gcc_assert (!new_bb);
7730 t = unshare_expr (new_name);
7731 gcc_assert (CONSTANT_CLASS_P (new_name)
7732 || TREE_CODE (new_name) == SSA_NAME);
7733 new_vec = build_vector_from_val (step_vectype, t);
7734 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7735 new_vec, step_vectype, NULL);
7738 /* Create the following def-use cycle:
7739 loop prolog:
7740 vec_init = ...
7741 vec_step = ...
7742 loop:
7743 vec_iv = PHI <vec_init, vec_loop>
7745 STMT
7747 vec_loop = vec_iv + vec_step; */
7749 /* Create the induction-phi that defines the induction-operand. */
7750 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7751 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7752 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7753 induc_def = PHI_RESULT (induction_phi);
7755 /* Create the iv update inside the loop */
7756 stmts = NULL;
7757 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7758 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7759 vec_def = gimple_convert (&stmts, vectype, vec_def);
7760 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7761 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7762 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7764 /* Set the arguments of the phi node: */
7765 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7766 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7767 UNKNOWN_LOCATION);
7769 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7771 /* In case the vectorization factor (VF) is bigger than the number
7772 of elements that we can fit in a vectype (nunits), we have to generate
7773 more than one vector stmt, i.e., we need to "unroll" the
7774 vector stmt by a factor VF/nunits. For more details see documentation
7775 in vectorizable_operation. */
7777 if (ncopies > 1)
7779 gimple_seq seq = NULL;
7780 stmt_vec_info prev_stmt_vinfo;
7781 /* FORNOW. This restriction should be relaxed. */
7782 gcc_assert (!nested_in_vect_loop);
7784 /* Create the vector that holds the step of the induction. */
7785 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7787 expr = build_int_cst (integer_type_node, nunits);
7788 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7790 else
7791 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7792 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7793 expr, step_expr);
7794 if (seq)
7796 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7797 gcc_assert (!new_bb);
7800 t = unshare_expr (new_name);
7801 gcc_assert (CONSTANT_CLASS_P (new_name)
7802 || TREE_CODE (new_name) == SSA_NAME);
7803 new_vec = build_vector_from_val (step_vectype, t);
7804 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7805 new_vec, step_vectype, NULL);
7807 vec_def = induc_def;
7808 prev_stmt_vinfo = induction_phi_info;
7809 for (i = 1; i < ncopies; i++)
7811 /* vec_i = vec_prev + vec_step */
7812 gimple_seq stmts = NULL;
7813 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7814 vec_def = gimple_build (&stmts,
7815 PLUS_EXPR, step_vectype, vec_def, vec_step);
7816 vec_def = gimple_convert (&stmts, vectype, vec_def);
7818 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7819 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7820 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7821 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7822 prev_stmt_vinfo = new_stmt_info;
7826 if (nested_in_vect_loop)
7828 /* Find the loop-closed exit-phi of the induction, and record
7829 the final vector of induction results: */
7830 exit_phi = NULL;
7831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7833 gimple *use_stmt = USE_STMT (use_p);
7834 if (is_gimple_debug (use_stmt))
7835 continue;
7837 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7839 exit_phi = use_stmt;
7840 break;
7843 if (exit_phi)
7845 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7846 /* FORNOW. Currently not supporting the case that an inner-loop induction
7847 is not used in the outer-loop (i.e. only outside the outer-loop). */
7848 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7849 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7851 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7852 if (dump_enabled_p ())
7853 dump_printf_loc (MSG_NOTE, vect_location,
7854 "vector of inductions after inner-loop:%G",
7855 new_stmt);
7860 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_NOTE, vect_location,
7862 "transform induction: created def-use cycle: %G%G",
7863 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7865 return true;
7868 /* Function vectorizable_live_operation.
7870 STMT_INFO computes a value that is used outside the loop. Check if
7871 it can be supported. */
7873 bool
7874 vectorizable_live_operation (loop_vec_info loop_vinfo,
7875 stmt_vec_info stmt_info,
7876 gimple_stmt_iterator *gsi,
7877 slp_tree slp_node, slp_instance slp_node_instance,
7878 int slp_index, bool vec_stmt_p,
7879 stmt_vector_for_cost *)
7881 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7882 imm_use_iterator imm_iter;
7883 tree lhs, lhs_type, bitsize, vec_bitsize;
7884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7885 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7886 int ncopies;
7887 gimple *use_stmt;
7888 auto_vec<tree> vec_oprnds;
7889 int vec_entry = 0;
7890 poly_uint64 vec_index = 0;
7892 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7894 /* If a stmt of a reduction is live, vectorize it via
7895 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7896 validity so just trigger the transform here. */
7897 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7899 if (!vec_stmt_p)
7900 return true;
7901 if (slp_node)
7903 /* For reduction chains the meta-info is attached to
7904 the group leader. */
7905 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7906 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7907 /* For SLP reductions we vectorize the epilogue for
7908 all involved stmts together. */
7909 else if (slp_index != 0)
7910 return true;
7912 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7913 gcc_assert (reduc_info->is_reduc_info);
7914 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7915 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7916 return true;
7917 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
7918 slp_node_instance);
7919 return true;
7922 /* FORNOW. CHECKME. */
7923 if (nested_in_vect_loop_p (loop, stmt_info))
7924 return false;
7926 /* If STMT is not relevant and it is a simple assignment and its inputs are
7927 invariant then it can remain in place, unvectorized. The original last
7928 scalar value that it computes will be used. */
7929 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7931 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_NOTE, vect_location,
7934 "statement is simple and uses invariant. Leaving in "
7935 "place.\n");
7936 return true;
7939 if (slp_node)
7940 ncopies = 1;
7941 else
7942 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7944 if (slp_node)
7946 gcc_assert (slp_index >= 0);
7948 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7949 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7951 /* Get the last occurrence of the scalar index from the concatenation of
7952 all the slp vectors. Calculate which slp vector it is and the index
7953 within. */
7954 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
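/* Illustrative numbers: with num_vec == 2, nunits == 4, num_scalar == 6 and
   slp_index == 5, pos = 2*4 - 6 + 5 = 7, which below yields vec_entry == 1
   and vec_index == 3 (the last lane of the second vector).  */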
7956 /* Calculate which vector contains the result, and which lane of
7957 that vector we need. */
7958 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7960 if (dump_enabled_p ())
7961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7962 "Cannot determine which vector holds the"
7963 " final result.\n");
7964 return false;
7968 if (!vec_stmt_p)
7970 /* No transformation required. */
7971 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7973 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7974 OPTIMIZE_FOR_SPEED))
7976 if (dump_enabled_p ())
7977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 "can't use a fully-masked loop because "
7979 "the target doesn't support extract last "
7980 "reduction.\n");
7981 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7983 else if (slp_node)
7985 if (dump_enabled_p ())
7986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7987 "can't use a fully-masked loop because an "
7988 "SLP statement is live after the loop.\n");
7989 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7991 else if (ncopies > 1)
7993 if (dump_enabled_p ())
7994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7995 "can't use a fully-masked loop because"
7996 " ncopies is greater than 1.\n");
7997 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7999 else
8001 gcc_assert (ncopies == 1 && !slp_node);
8002 vect_record_loop_mask (loop_vinfo,
8003 &LOOP_VINFO_MASKS (loop_vinfo),
8004 1, vectype, NULL);
8007 return true;
8010 /* Use the lhs of the original scalar statement. */
8011 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8013 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8014 : gimple_get_lhs (stmt);
8015 lhs_type = TREE_TYPE (lhs);
8017 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8018 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8019 : TYPE_SIZE (TREE_TYPE (vectype)));
8020 vec_bitsize = TYPE_SIZE (vectype);
8022 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8023 tree vec_lhs, bitstart;
8024 if (slp_node)
8026 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8028 /* Get the correct slp vectorized stmt. */
8029 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8030 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8031 vec_lhs = gimple_phi_result (phi);
8032 else
8033 vec_lhs = gimple_get_lhs (vec_stmt);
8035 /* Get entry to use. */
8036 bitstart = bitsize_int (vec_index);
8037 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8039 else
8041 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8042 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8043 gcc_checking_assert (ncopies == 1
8044 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8046 /* For multiple copies, get the last copy. */
8047 for (int i = 1; i < ncopies; ++i)
8048 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8050 /* Get the last lane in the vector. */
8051 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
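/* E.g. (illustrative) for a 128-bit vector of four 32-bit elements,
   bitstart = 128 - 32 = 96, i.e. the BIT_FIELD_REF built below reads the
   last element.  */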
8054 /* To ensure that the VEC_LHS used by the lane-extraction stmts satisfies
8055 the loop-closed PHI requirement, insert one phi node for it. It looks like:
8056 loop;
8058 # lhs' = PHI <lhs>
8060 loop;
8062 # vec_lhs' = PHI <vec_lhs>
8063 new_tree = lane_extract <vec_lhs', ...>;
8064 lhs' = new_tree; */
8066 basic_block exit_bb = single_exit (loop)->dest;
8067 gcc_assert (single_pred_p (exit_bb));
8069 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8070 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8071 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8073 gimple_seq stmts = NULL;
8074 tree new_tree;
8075 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8077 /* Emit:
8079 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8081 where VEC_LHS is the vectorized live-out result and MASK is
8082 the loop mask for the final iteration. */
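/* EXTRACT_LAST is assumed here to return the element of VEC_LHS in the last
   lane for which MASK is set, i.e. the value produced by the final active
   scalar iteration.  */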
8083 gcc_assert (ncopies == 1 && !slp_node);
8084 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8085 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8086 vectype, 0);
8087 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8088 mask, vec_lhs_phi);
8090 /* Convert the extracted vector element to the required scalar type. */
8091 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8093 else
8095 tree bftype = TREE_TYPE (vectype);
8096 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8097 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8098 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8099 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8100 &stmts, true, NULL_TREE);
8103 if (stmts)
8105 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8106 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8108 /* Remove existing phi from lhs and create one copy from new_tree. */
8109 tree lhs_phi = NULL_TREE;
8110 gimple_stmt_iterator gsi;
8111 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8113 gimple *phi = gsi_stmt (gsi);
8114 if ((gimple_phi_arg_def (phi, 0) == lhs))
8116 remove_phi_node (&gsi, false);
8117 lhs_phi = gimple_phi_result (phi);
8118 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8119 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8120 break;
8125 /* Replace use of lhs with newly computed result. If the use stmt is a
8126 single arg PHI, just replace all uses of PHI result. It's necessary
8127 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8128 use_operand_p use_p;
8129 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8130 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8131 && !is_gimple_debug (use_stmt))
8133 if (gimple_code (use_stmt) == GIMPLE_PHI
8134 && gimple_phi_num_args (use_stmt) == 1)
8136 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8138 else
8140 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8141 SET_USE (use_p, new_tree);
8143 update_stmt (use_stmt);
8146 return true;
8149 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8151 static void
8152 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8154 ssa_op_iter op_iter;
8155 imm_use_iterator imm_iter;
8156 def_operand_p def_p;
8157 gimple *ustmt;
8159 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8161 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8163 basic_block bb;
8165 if (!is_gimple_debug (ustmt))
8166 continue;
8168 bb = gimple_bb (ustmt);
8170 if (!flow_bb_inside_loop_p (loop, bb))
8172 if (gimple_debug_bind_p (ustmt))
8174 if (dump_enabled_p ())
8175 dump_printf_loc (MSG_NOTE, vect_location,
8176 "killing debug use\n");
8178 gimple_debug_bind_reset_value (ustmt);
8179 update_stmt (ustmt);
8181 else
8182 gcc_unreachable ();
8188 /* Given loop represented by LOOP_VINFO, return true if computation of
8189 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8190 otherwise. */
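/* For example (illustrative): if the niters type is a 32-bit unsigned type
   and the loop runs 2^32 times, NITERSM1 is 0xffffffff while NITERS wraps to
   0, so the constant check below (NITERSM1 < NITERS) correctly fails; in the
   symbolic case, a maximum iteration bound strictly below the type's maximum
   value guarantees that NITERSM1 + 1 cannot wrap.  */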
8192 static bool
8193 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8195 /* Constant case. */
8196 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8198 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8199 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8201 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8202 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8203 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8204 return true;
8207 widest_int max;
8208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8209 /* Check the upper bound of loop niters. */
8210 if (get_max_loop_iterations (loop, &max))
8212 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8213 signop sgn = TYPE_SIGN (type);
8214 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8215 if (max < type_max)
8216 return true;
8218 return false;
8221 /* Return a mask type with half the number of elements as OLD_TYPE,
8222 given that it should have mode NEW_MODE. */
8224 tree
8225 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8227 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8228 return build_truth_vector_type_for_mode (nunits, new_mode);
8231 /* Return a mask type with twice as many elements as OLD_TYPE,
8232 given that it should have mode NEW_MODE. */
8234 tree
8235 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8237 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8238 return build_truth_vector_type_for_mode (nunits, new_mode);
8241 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8242 contain a sequence of NVECTORS masks that each control a vector of type
8243 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8244 these vector masks with the vector version of SCALAR_MASK. */
8246 void
8247 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8248 unsigned int nvectors, tree vectype, tree scalar_mask)
8250 gcc_assert (nvectors != 0);
8251 if (masks->length () < nvectors)
8252 masks->safe_grow_cleared (nvectors);
8253 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8254 /* The number of scalars per iteration and the number of vectors are
8255 both compile-time constants. */
8256 unsigned int nscalars_per_iter
8257 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8258 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
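/* Illustrative instance: with a vectorization factor of 8, a vector type of
   4 elements and nvectors == 2, nscalars_per_iter = 2*4 / 8 = 1, i.e. this
   rgroup controls one scalar of this kind per scalar iteration.  */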
8260 if (scalar_mask)
8262 scalar_cond_masked_key cond (scalar_mask, nvectors);
8263 loop_vinfo->scalar_cond_masked_set.add (cond);
8266 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8268 rgm->max_nscalars_per_iter = nscalars_per_iter;
8269 rgm->mask_type = truth_type_for (vectype);
8273 /* Given a complete set of masks MASKS, extract mask number INDEX
8274 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8275 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8277 See the comment above vec_loop_masks for more details about the mask
8278 arrangement. */
8280 tree
8281 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8282 unsigned int nvectors, tree vectype, unsigned int index)
8284 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8285 tree mask_type = rgm->mask_type;
8287 /* Populate the rgroup's mask array, if this is the first time we've
8288 used it. */
8289 if (rgm->masks.is_empty ())
8291 rgm->masks.safe_grow_cleared (nvectors);
8292 for (unsigned int i = 0; i < nvectors; ++i)
8294 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8295 /* Provide a dummy definition until the real one is available. */
8296 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8297 rgm->masks[i] = mask;
8301 tree mask = rgm->masks[index];
8302 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8303 TYPE_VECTOR_SUBPARTS (vectype)))
8305 /* A loop mask for data type X can be reused for data type Y
8306 if X has N times more elements than Y and if Y's elements
8307 are N times bigger than X's. In this case each sequence
8308 of N elements in the loop mask will be all-zero or all-one.
8309 We can then view-convert the mask so that each sequence of
8310 N elements is replaced by a single element. */
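/* For instance (illustrative): a mask created for 8 16-bit elements can
   control a vector of 4 32-bit elements; each pair of mask elements is known
   to be all-zero or all-one, so the VIEW_CONVERT_EXPR built below folds every
   such pair into a single wider mask element.  */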
8311 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8312 TYPE_VECTOR_SUBPARTS (vectype)));
8313 gimple_seq seq = NULL;
8314 mask_type = truth_type_for (vectype);
8315 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8316 if (seq)
8317 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8319 return mask;
8322 /* Scale profiling counters by estimation for LOOP which is vectorized
8323 by factor VF. */
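/* Rough illustration (assuming the unrolled-niter estimate behaves like
   simple division): with VF == 4 and a scalar loop expected to iterate about
   100 times, the vector loop is expected to iterate about 25 times, so the
   exit edge probability computed below becomes roughly 1/(25 + 1).  */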
8325 static void
8326 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8328 edge preheader = loop_preheader_edge (loop);
8329 /* Reduce loop iterations by the vectorization factor. */
8330 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8331 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8333 if (freq_h.nonzero_p ())
8335 profile_probability p;
8337 /* Avoid dropping loop body profile counter to 0 because of zero count
8338 in loop's preheader. */
8339 if (!(freq_e == profile_count::zero ()))
8340 freq_e = freq_e.force_nonzero ();
8341 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8342 scale_loop_frequencies (loop, p);
8345 edge exit_e = single_exit (loop);
8346 exit_e->probability = profile_probability::always ()
8347 .apply_scale (1, new_est_niter + 1);
8349 edge exit_l = single_pred_edge (loop->latch);
8350 profile_probability prob = exit_l->probability;
8351 exit_l->probability = exit_e->probability.invert ();
8352 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8353 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8356 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8357 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8358 stmt_vec_info. */
8360 static void
8361 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8362 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8364 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8365 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8367 if (dump_enabled_p ())
8368 dump_printf_loc (MSG_NOTE, vect_location,
8369 "------>vectorizing statement: %G", stmt_info->stmt);
8371 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8372 vect_loop_kill_debug_uses (loop, stmt_info);
8374 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8375 && !STMT_VINFO_LIVE_P (stmt_info))
8376 return;
8378 if (STMT_VINFO_VECTYPE (stmt_info))
8380 poly_uint64 nunits
8381 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8382 if (!STMT_SLP_TYPE (stmt_info)
8383 && maybe_ne (nunits, vf)
8384 && dump_enabled_p ())
8385 /* For SLP, VF is set according to the unrolling factor, not
8386 to the vector size, hence this dump message is not meaningful for SLP. */
8387 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8390 /* Pure SLP statements have already been vectorized. We still need
8391 to apply loop vectorization to hybrid SLP statements. */
8392 if (PURE_SLP_STMT (stmt_info))
8393 return;
8395 if (dump_enabled_p ())
8396 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8398 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8399 *seen_store = stmt_info;
8402 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8403 in the hash_map with its corresponding values. */
8405 static tree
8406 find_in_mapping (tree t, void *context)
8408 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8410 tree *value = mapping->get (t);
8411 return value ? *value : t;
8414 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8415 original loop that has now been vectorized.
8417 The inits of the data_references need to be advanced with the number of
8418 iterations of the main loop. This has been computed in vect_do_peeling and
8419 is stored in parameter ADVANCE. We first restore the data_references
8420 initial offset with the values recorded in ORIG_DRS_INIT.
8422 Since the loop_vec_info of this EPILOGUE was constructed for the original
8423 loop, its stmt_vec_infos all point to the original statements. These need
8424 to be updated to point to their corresponding copies as well as the SSA_NAMES
8425 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8427 The data_reference's connections also need to be updated. Their
8428 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8429 stmt_vec_infos, their statements need to point to their corresponding copy,
8430 if they are gather loads or scatter stores then their reference needs to be
8431 updated to point to its corresponding copy and finally we set
8432 'base_misaligned' to false as we have already peeled for alignment in the
8433 prologue of the main loop. */
8435 static void
8436 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8438 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8439 auto_vec<gimple *> stmt_worklist;
8440 hash_map<tree,tree> mapping;
8441 gimple *orig_stmt, *new_stmt;
8442 gimple_stmt_iterator epilogue_gsi;
8443 gphi_iterator epilogue_phi_gsi;
8444 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8445 basic_block *epilogue_bbs = get_loop_body (epilogue);
8446 unsigned i;
8448 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8450 /* Advance the data_references by the number of iterations of the previous
8451 loop and its prologue. */
8452 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8455 /* The EPILOGUE loop is a copy of the original loop so they share the same
8456 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8457 point to the copied statements. We also create a mapping of all LHS' in
8458 the original loop and all the LHS' in the EPILOGUE and create worklists to
8459 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8460 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8462 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8463 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8465 new_stmt = epilogue_phi_gsi.phi ();
8467 gcc_assert (gimple_uid (new_stmt) > 0);
8468 stmt_vinfo
8469 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8471 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8472 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8474 mapping.put (gimple_phi_result (orig_stmt),
8475 gimple_phi_result (new_stmt));
8476 /* PHI nodes cannot have patterns or related statements. */
8477 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8478 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8481 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8482 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8484 new_stmt = gsi_stmt (epilogue_gsi);
8486 gcc_assert (gimple_uid (new_stmt) > 0);
8487 stmt_vinfo
8488 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8490 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8491 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8493 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8494 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8496 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8498 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8499 for (gimple_stmt_iterator gsi = gsi_start (seq);
8500 !gsi_end_p (gsi); gsi_next (&gsi))
8501 stmt_worklist.safe_push (gsi_stmt (gsi));
8504 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8505 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8507 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8508 stmt_worklist.safe_push (stmt);
8509 /* Set BB such that the assert in
8510 'get_initial_def_for_reduction' is able to determine that
8511 the BB of the related stmt is inside this loop. */
8512 gimple_set_bb (stmt,
8513 gimple_bb (new_stmt));
8514 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8515 gcc_assert (related_vinfo == NULL
8516 || related_vinfo == stmt_vinfo);
8521 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8522 using the original main loop and thus need to be updated to refer to the
8523 cloned variables used in the epilogue. */
8524 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8526 gimple *stmt = stmt_worklist[i];
8527 tree *new_op;
8529 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8531 tree op = gimple_op (stmt, j);
8532 if ((new_op = mapping.get(op)))
8533 gimple_set_op (stmt, j, *new_op);
8534 else
8536 /* PR92429: The last argument of simplify_replace_tree disables
8537 folding when replacing arguments. This is required as
8538 otherwise we might end up with different statements from the
8539 ones analyzed in vect_analyze_loop, leading to different
8540 vectorization. */
8541 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8542 &find_in_mapping, &mapping, false);
8543 gimple_set_op (stmt, j, op);
8548 struct data_reference *dr;
8549 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8550 FOR_EACH_VEC_ELT (datarefs, i, dr)
8552 orig_stmt = DR_STMT (dr);
8553 gcc_assert (gimple_uid (orig_stmt) > 0);
8554 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8555 /* Data references for gather loads and scatter stores do not use the
8556 updated offset we set using ADVANCE. Instead we have to make sure the
8557 references in the data references point to the corresponding copies of
8558 the originals in the epilogue. */
8559 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8560 == VMAT_GATHER_SCATTER)
8562 DR_REF (dr)
8563 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8564 &find_in_mapping, &mapping);
8565 DR_BASE_ADDRESS (dr)
8566 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8567 &find_in_mapping, &mapping);
8569 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8570 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8571 /* The vector size of the epilogue is smaller than that of the main loop,
8572 so its alignment requirement is the same or lower. This means the dr
8573 is by definition aligned. */
8574 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8577 epilogue_vinfo->shared->datarefs_copy.release ();
8578 epilogue_vinfo->shared->save_datarefs ();
8581 /* Function vect_transform_loop.
8583 The analysis phase has determined that the loop is vectorizable.
8584 Vectorize the loop - create vectorized stmts to replace the scalar
8585 stmts in the loop, and update the loop exit condition.
8586 Returns the scalar epilogue loop, if any. */
8588 class loop *
8589 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8591 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8592 class loop *epilogue = NULL;
8593 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8594 int nbbs = loop->num_nodes;
8595 int i;
8596 tree niters_vector = NULL_TREE;
8597 tree step_vector = NULL_TREE;
8598 tree niters_vector_mult_vf = NULL_TREE;
8599 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8600 unsigned int lowest_vf = constant_lower_bound (vf);
8601 gimple *stmt;
8602 bool check_profitability = false;
8603 unsigned int th;
8605 DUMP_VECT_SCOPE ("vec_transform_loop");
8607 loop_vinfo->shared->check_datarefs ();
8609 /* Use the more conservative vectorization threshold. If the number
8610 of iterations is constant, assume the cost check has been performed
8611 by our caller. If the threshold makes all loops profitable that
8612 run at least the (estimated) vectorization factor number of times,
8613 checking is pointless, too. */
8614 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8615 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8617 if (dump_enabled_p ())
8618 dump_printf_loc (MSG_NOTE, vect_location,
8619 "Profitability threshold is %d loop iterations.\n",
8620 th);
8621 check_profitability = true;
8624 /* Make sure there exists a single-predecessor exit bb. Do this before
8625 versioning. */
8626 edge e = single_exit (loop);
8627 if (! single_pred_p (e->dest))
8629 split_loop_exit_edge (e, true);
8630 if (dump_enabled_p ())
8631 dump_printf (MSG_NOTE, "split exit edge\n");
8634 /* Version the loop first, if required, so the profitability check
8635 comes first. */
8637 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8639 class loop *sloop
8640 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8641 sloop->force_vectorize = false;
8642 check_profitability = false;
8645 /* Make sure there exists a single-predecessor exit bb also on the
8646 scalar loop copy. Do this after versioning but before peeling
8647 so the CFG structure is fine for both the scalar and the if-converted
8648 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8649 loop-closed PHI nodes on the exit. */
8650 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8652 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8653 if (! single_pred_p (e->dest))
8655 split_loop_exit_edge (e, true);
8656 if (dump_enabled_p ())
8657 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8661 tree niters = vect_build_loop_niters (loop_vinfo);
8662 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8663 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8664 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8665 tree advance;
8666 drs_init_vec orig_drs_init;
8668 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8669 &step_vector, &niters_vector_mult_vf, th,
8670 check_profitability, niters_no_overflow,
8671 &advance);
8673 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8674 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8675 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8676 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8678 if (niters_vector == NULL_TREE)
8680 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8681 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8682 && known_eq (lowest_vf, vf))
8684 niters_vector
8685 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8686 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8687 step_vector = build_one_cst (TREE_TYPE (niters));
8689 else
8690 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8691 &step_vector, niters_no_overflow);
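/* For example (purely illustrative numbers): with a compile-time niter
   count of 16, a constant VF of 4 and no masking, NITERS_VECTOR is the
   constant 4 and STEP_VECTOR is 1, so the vector loop executes its body
   four times.  */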
8694 /* 1) Make sure the loop header has exactly two entries
8695 2) Make sure we have a preheader basic block. */
8697 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8699 split_edge (loop_preheader_edge (loop));
8701 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8702 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8703 /* This will deal with any possible peeling. */
8704 vect_prepare_for_masked_peels (loop_vinfo);
8706 /* Schedule the SLP instances first, then handle loop vectorization
8707 below. */
8708 if (!loop_vinfo->slp_instances.is_empty ())
8710 DUMP_VECT_SCOPE ("scheduling SLP instances");
8711 vect_schedule_slp (loop_vinfo);
8714 /* FORNOW: the vectorizer supports only loops whose body consists
8715 of one basic block (header + empty latch). When the vectorizer
8716 supports more involved loop forms, the order in which the BBs are
8717 traversed will need to be reconsidered. */
8719 for (i = 0; i < nbbs; i++)
8721 basic_block bb = bbs[i];
8722 stmt_vec_info stmt_info;
8724 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8725 gsi_next (&si))
8727 gphi *phi = si.phi ();
8728 if (dump_enabled_p ())
8729 dump_printf_loc (MSG_NOTE, vect_location,
8730 "------>vectorizing phi: %G", phi);
8731 stmt_info = loop_vinfo->lookup_stmt (phi);
8732 if (!stmt_info)
8733 continue;
8735 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8736 vect_loop_kill_debug_uses (loop, stmt_info);
8738 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8739 && !STMT_VINFO_LIVE_P (stmt_info))
8740 continue;
8742 if (STMT_VINFO_VECTYPE (stmt_info)
8743 && (maybe_ne
8744 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8745 && dump_enabled_p ())
8746 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8748 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8749 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8750 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8751 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8752 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8753 && ! PURE_SLP_STMT (stmt_info))
8755 if (dump_enabled_p ())
8756 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8757 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8761 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8762 !gsi_end_p (si);)
8764 stmt = gsi_stmt (si);
8765 /* During vectorization remove existing clobber stmts. */
8766 if (gimple_clobber_p (stmt))
8768 unlink_stmt_vdef (stmt);
8769 gsi_remove (&si, true);
8770 release_defs (stmt);
8772 else
8774 stmt_info = loop_vinfo->lookup_stmt (stmt);
8776 /* vector stmts created in the outer-loop during vectorization of
8777 stmts in an inner-loop may not have a stmt_info, and do not
8778 need to be vectorized. */
8779 stmt_vec_info seen_store = NULL;
8780 if (stmt_info)
8782 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8784 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8785 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8786 !gsi_end_p (subsi); gsi_next (&subsi))
8788 stmt_vec_info pat_stmt_info
8789 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8790 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8791 &si, &seen_store);
8793 stmt_vec_info pat_stmt_info
8794 = STMT_VINFO_RELATED_STMT (stmt_info);
8795 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8796 &seen_store);
8798 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8799 &seen_store);
8801 gsi_next (&si);
8802 if (seen_store)
8804 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8805 /* Interleaving: the vectorization of the
8806 interleaving chain has been completed -
8807 free all the stores in the chain. */
8808 vect_remove_stores (loop_vinfo,
8809 DR_GROUP_FIRST_ELEMENT (seen_store));
8810 else
8811 /* Free the attached stmt_vec_info and remove the stmt. */
8812 loop_vinfo->remove_stmt (stmt_info);
8817 /* Stub out scalar statements that must not survive vectorization.
8818 Doing this here helps with grouped statements, or statements that
8819 are involved in patterns. */
8820 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8821 !gsi_end_p (gsi); gsi_next (&gsi))
8823 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8824 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8826 tree lhs = gimple_get_lhs (call);
8827 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8829 tree zero = build_zero_cst (TREE_TYPE (lhs));
8830 gimple *new_stmt = gimple_build_assign (lhs, zero);
8831 gsi_replace (&gsi, new_stmt, true);
8835 } /* BBs in loop */
8837 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8838 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8839 if (integer_onep (step_vector))
8840 niters_no_overflow = true;
8841 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8842 niters_vector_mult_vf, !niters_no_overflow);
8844 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8845 scale_profile_for_vect_loop (loop, assumed_vf);
8847 /* True if the final iteration might not handle a full vector's
8848 worth of scalar iterations. */
8849 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8850 /* The minimum number of iterations performed by the epilogue. This
8851 is 1 when peeling for gaps because we always need a final scalar
8852 iteration. */
8853 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8854 /* +1 to convert latch counts to loop iteration counts,
8855 -min_epilogue_iters to remove iterations that cannot be performed
8856 by the vector code. */
8857 int bias_for_lowest = 1 - min_epilogue_iters;
8858 int bias_for_assumed = bias_for_lowest;
8859 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8860 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8862 /* When the amount of peeling is known at compile time, the first
8863 iteration will have exactly alignment_npeels active elements.
8864 In the worst case it will have at least one. */
8865 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8866 bias_for_lowest += lowest_vf - min_first_active;
8867 bias_for_assumed += assumed_vf - min_first_active;
8869 /* In these calculations the "- 1" converts loop iteration counts
8870 back to latch counts. */
8871 if (loop->any_upper_bound)
8872 loop->nb_iterations_upper_bound
8873 = (final_iter_may_be_partial
8874 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8875 lowest_vf) - 1
8876 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8877 lowest_vf) - 1);
8878 if (loop->any_likely_upper_bound)
8879 loop->nb_iterations_likely_upper_bound
8880 = (final_iter_may_be_partial
8881 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8882 + bias_for_lowest, lowest_vf) - 1
8883 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8884 + bias_for_lowest, lowest_vf) - 1);
8885 if (loop->any_estimate)
8886 loop->nb_iterations_estimate
8887 = (final_iter_may_be_partial
8888 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8889 assumed_vf) - 1
8890 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8891 assumed_vf) - 1);
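/* Worked example (illustrative numbers): with a constant VF of 4, no
   peeling for gaps and no full masking, BIAS_FOR_LOWEST is 1.  A scalar
   upper bound of 11 latch iterations (at most 12 loop iterations) then
   becomes udiv_floor (11 + 1, 4) - 1 = 2 latch iterations of the vector
   loop, i.e. its body runs at most 3 times.  */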
8893 if (dump_enabled_p ())
8895 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8897 dump_printf_loc (MSG_NOTE, vect_location,
8898 "LOOP VECTORIZED\n");
8899 if (loop->inner)
8900 dump_printf_loc (MSG_NOTE, vect_location,
8901 "OUTER LOOP VECTORIZED\n");
8902 dump_printf (MSG_NOTE, "\n");
8904 else
8905 dump_printf_loc (MSG_NOTE, vect_location,
8906 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8907 GET_MODE_NAME (loop_vinfo->vector_mode));
8910 /* Loops vectorized with a variable factor won't benefit from
8911 unrolling/peeling. */
8912 if (!vf.is_constant ())
8914 loop->unroll = 1;
8915 if (dump_enabled_p ())
8916 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8917 " variable-length vectorization factor\n");
8919 /* Free SLP instances here because otherwise stmt reference counting
8920 won't work. */
8921 slp_instance instance;
8922 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8923 vect_free_slp_instance (instance, true);
8924 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8925 /* Clear the safelen field since its value is invalid after vectorization,
8926 as the vectorized loop can have loop-carried dependencies. */
8927 loop->safelen = 0;
8929 if (epilogue)
8931 update_epilogue_loop_vinfo (epilogue, advance);
8933 epilogue->simduid = loop->simduid;
8934 epilogue->force_vectorize = loop->force_vectorize;
8935 epilogue->dont_vectorize = false;
8938 return epilogue;
8941 /* The code below tries to perform a simple optimization - revert
8942 if-conversion for masked stores, i.e. if the mask of a store is zero,
8943 do not perform the store and, if possible, skip all stored-value producers too.
8944 For example,
8945 for (i=0; i<n; i++)
8946 if (c[i])
8948 p1[i] += 1;
8949 p2[i] = p3[i] + 2;
8951 this transformation will produce the following semi-hammock:
8953 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8955 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8956 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8957 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8958 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8959 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8960 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8964 void
8965 optimize_mask_stores (class loop *loop)
8967 basic_block *bbs = get_loop_body (loop);
8968 unsigned nbbs = loop->num_nodes;
8969 unsigned i;
8970 basic_block bb;
8971 class loop *bb_loop;
8972 gimple_stmt_iterator gsi;
8973 gimple *stmt;
8974 auto_vec<gimple *> worklist;
8975 auto_purge_vect_location sentinel;
8977 vect_location = find_loop_location (loop);
8978 /* Pick up all masked stores in loop if any. */
8979 for (i = 0; i < nbbs; i++)
8981 bb = bbs[i];
8982 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8983 gsi_next (&gsi))
8985 stmt = gsi_stmt (gsi);
8986 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8987 worklist.safe_push (stmt);
8991 free (bbs);
8992 if (worklist.is_empty ())
8993 return;
8995 /* Loop has masked stores. */
8996 while (!worklist.is_empty ())
8998 gimple *last, *last_store;
8999 edge e, efalse;
9000 tree mask;
9001 basic_block store_bb, join_bb;
9002 gimple_stmt_iterator gsi_to;
9003 tree vdef, new_vdef;
9004 gphi *phi;
9005 tree vectype;
9006 tree zero;
9008 last = worklist.pop ();
9009 mask = gimple_call_arg (last, 2);
9010 bb = gimple_bb (last);
9011 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9012 to the same loop as if_bb. It can be different from LOOP when a
9013 two-level loop nest is vectorized and the mask_store belongs to the
9014 inner one. */
9015 e = split_block (bb, last);
9016 bb_loop = bb->loop_father;
9017 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9018 join_bb = e->dest;
9019 store_bb = create_empty_bb (bb);
9020 add_bb_to_loop (store_bb, bb_loop);
9021 e->flags = EDGE_TRUE_VALUE;
9022 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9023 /* Put STORE_BB on the unlikely path. */
9024 efalse->probability = profile_probability::unlikely ();
9025 store_bb->count = efalse->count ();
9026 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9027 if (dom_info_available_p (CDI_DOMINATORS))
9028 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9029 if (dump_enabled_p ())
9030 dump_printf_loc (MSG_NOTE, vect_location,
9031 "Create new block %d to sink mask stores.",
9032 store_bb->index);
9033 /* Create vector comparison with boolean result. */
9034 vectype = TREE_TYPE (mask);
9035 zero = build_zero_cst (vectype);
9036 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9037 gsi = gsi_last_bb (bb);
9038 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9039 /* Create new PHI node for vdef of the last masked store:
9040 .MEM_2 = VDEF <.MEM_1>
9041 will be converted to
9042 .MEM.3 = VDEF <.MEM_1>
9043 and new PHI node will be created in join bb
9044 .MEM_2 = PHI <.MEM_1, .MEM_3>
9046 vdef = gimple_vdef (last);
9047 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9048 gimple_set_vdef (last, new_vdef);
9049 phi = create_phi_node (vdef, join_bb);
9050 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9052 /* Put all masked stores with the same mask to STORE_BB if possible. */
9053 while (true)
9055 gimple_stmt_iterator gsi_from;
9056 gimple *stmt1 = NULL;
9058 /* Move masked store to STORE_BB. */
9059 last_store = last;
9060 gsi = gsi_for_stmt (last);
9061 gsi_from = gsi;
9062 /* Shift GSI to the previous stmt for further traversal. */
9063 gsi_prev (&gsi);
9064 gsi_to = gsi_start_bb (store_bb);
9065 gsi_move_before (&gsi_from, &gsi_to);
9066 /* Set GSI_TO to the start of the now non-empty block. */
9067 gsi_to = gsi_start_bb (store_bb);
9068 if (dump_enabled_p ())
9069 dump_printf_loc (MSG_NOTE, vect_location,
9070 "Move stmt to created bb\n%G", last);
9071 /* Move all stored value producers if possible. */
9072 while (!gsi_end_p (gsi))
9074 tree lhs;
9075 imm_use_iterator imm_iter;
9076 use_operand_p use_p;
9077 bool res;
9079 /* Skip debug statements. */
9080 if (is_gimple_debug (gsi_stmt (gsi)))
9082 gsi_prev (&gsi);
9083 continue;
9085 stmt1 = gsi_stmt (gsi);
9086 /* Do not consider statements writing to memory or having
9087 a volatile operand. */
9088 if (gimple_vdef (stmt1)
9089 || gimple_has_volatile_ops (stmt1))
9090 break;
9091 gsi_from = gsi;
9092 gsi_prev (&gsi);
9093 lhs = gimple_get_lhs (stmt1);
9094 if (!lhs)
9095 break;
9097 /* LHS of vectorized stmt must be SSA_NAME. */
9098 if (TREE_CODE (lhs) != SSA_NAME)
9099 break;
9101 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9103 /* Remove dead scalar statement. */
9104 if (has_zero_uses (lhs))
9106 gsi_remove (&gsi_from, true);
9107 continue;
9111 /* Check that LHS does not have uses outside of STORE_BB. */
9112 res = true;
9113 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9115 gimple *use_stmt;
9116 use_stmt = USE_STMT (use_p);
9117 if (is_gimple_debug (use_stmt))
9118 continue;
9119 if (gimple_bb (use_stmt) != store_bb)
9121 res = false;
9122 break;
9125 if (!res)
9126 break;
9128 if (gimple_vuse (stmt1)
9129 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9130 break;
9132 /* Can move STMT1 to STORE_BB. */
9133 if (dump_enabled_p ())
9134 dump_printf_loc (MSG_NOTE, vect_location,
9135 "Move stmt to created bb\n%G", stmt1);
9136 gsi_move_before (&gsi_from, &gsi_to);
9137 /* Shift GSI_TO for further insertion. */
9138 gsi_prev (&gsi_to);
9140 /* Put other masked stores with the same mask to STORE_BB. */
9141 if (worklist.is_empty ()
9142 || gimple_call_arg (worklist.last (), 2) != mask
9143 || worklist.last () != stmt1)
9144 break;
9145 last = worklist.pop ();
9147 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9151 /* Decide whether it is possible to use a zero-based induction variable
9152 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9153 return the value that the induction variable must be able to hold
9154 in order to ensure that the loop ends with an all-false mask.
9155 Return -1 otherwise. */
9156 widest_int
9157 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9159 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9160 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9161 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9163 /* Calculate the value that the induction variable must be able
9164 to hit in order to ensure that we end the loop with an all-false mask.
9165 This involves adding the maximum number of inactive trailing scalar
9166 iterations. */
9167 widest_int iv_limit = -1;
9168 if (max_loop_iterations (loop, &iv_limit))
9170 if (niters_skip)
9172 /* Add the maximum number of skipped iterations to the
9173 maximum iteration count. */
9174 if (TREE_CODE (niters_skip) == INTEGER_CST)
9175 iv_limit += wi::to_widest (niters_skip);
9176 else
9177 iv_limit += max_vf - 1;
9179 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9180 /* Make a conservatively-correct assumption. */
9181 iv_limit += max_vf - 1;
9183 /* IV_LIMIT is the maximum number of latch iterations, which is also
9184 the maximum in-range IV value. Round this value down to the previous
9185 vector alignment boundary and then add an extra full iteration. */
9186 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9187 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
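/* Worked example (illustrative numbers): with VF = MAX_VF = 4, no
   skipped iterations and no peeling for alignment, a maximum latch
   count of 10 rounds down to the alignment boundary 8, and one extra
   full iteration is added, giving IV_LIMIT = (10 & -4) + 4 = 12.  */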
9189 return iv_limit;