gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
 70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
 92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141 Targets that can support different sizes of vectors will, for now, need
 142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
 145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
 148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
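/* As a concrete, self-contained illustration of the transformation described
   above (an assumed example using the GNU vector_size extension rather than
   the mode attribute; N is taken to be a multiple of 8):

     #define N 1024
     typedef short v8hi __attribute__ ((vector_size (16)));    8 x 16-bit lanes
     short a[N], b[N], c[N];

     void scalar_add (void)
     {
       for (int i = 0; i < N; i++)
         a[i] = b[i] + c[i];
     }

     void manual_vector_add (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];                 8 short additions per iteration
     }

   The vectorizer performs the same rewrite on GIMPLE, using the analyses and
   transformations implemented in this file.  */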
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
 166 vec<stmt_vec_info> *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 215 vec<stmt_vec_info> *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
 241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 242 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
 265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
 266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
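/* A worked instance of the above (illustrative numbers): with 4-byte ints and
   a 16-byte vector size, VF = 16 / 4 = 4 elements per vector, and the
   strip-mined loop processes 4 scalar iterations per vector iteration:

     for (i = 0; i < (N / 4) * 4; i += 4)
       a[i:4] = b[i:4] + c[i:4];        4 elements at a time, same pseudo-notation
     for (; i < N; i++)                 scalar epilogue for the remaining N % 4
       a[i] = b[i] + c[i];

   The epilogue (or an alternative such as peeling or masking) is produced by
   later parts of the vectorizer, not by this function.  */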
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.
 391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
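/* Illustration (assumed example): for a pointer induction such as

     for (i = 0; i < n; i++)
       p = p + 4;

   the access function returned by the scalar evolution analyzer is the
   polynomial chrec {p_0, +, 4}_L (L being the loop number), so
   initial_condition_in_loop_num yields p_0 (reported through *INIT below)
   and evolution_part_in_loop_num yields 4 (reported through *STEP).
   A step that is itself a chrec, or that is defined inside the loop, makes
   the evolution "not simple" and this function returns false.  */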
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
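/* A source-level shape that can give rise to the PHI structure above, when
   the outer loop is the one being analyzed (illustrative only):

     int s = 0;
     for (j = 0; j < M; j++)      outer1: x_1 = PHI <x_4(outer2), 0>
       for (i = 0; i < N; i++)    inner:  x_2 = PHI <x_1(outer1), x_3(inner)>
         s += a[j][i];                    x_3 = x_2 + a[j][i];
                                  outer2: x_4 = PHI <x_3(inner)>

   Outer-loop analysis classifies x_1 as a double reduction; this predicate
   then recognizes x_2 as the corresponding inner PHI.  */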
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
 617 We do that for the loop represented by LOOP_VINFO, and also for its
 618 inner-loop, if it exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
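/* Example3: nested cycle (only relevant when the enclosing outer loop is the
   one being vectorized; illustrative):

   loop3:
   for (j = 0; j < M; j++)
     {
       x = init[j];
       for (i = 0; i < N; i++)
         x = x + a[j][i];          cross-iteration cycle of x in the inner loop
       out[j] = x;
     }

   Because the inner loop stays sequential under outer-loop vectorization,
   vect_analyze_scalar_cycles_1 classifies such a cycle as vect_nested_cycle
   rather than as a reduction of the vectorized loop.  */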
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
 695 /* If not all stmts in the chain are patterns, try to handle
 696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
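/* A worked instance (illustrative): for

     for (i = 0; i < n; i++)
       ...

   the latch runs n - 1 times when n > 0, so NUMBER_OF_ITERATIONSM1 is n - 1
   and NUMBER_OF_ITERATIONS is n.  When the niter analysis can only prove the
   count under a premise (for example that the loop body executes at all),
   that premise is either folded into ASSUMPTIONS, on which the loop can later
   be versioned, or expressed through the MAY_BE_ZERO condition handled
   below.  */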
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 simd_if_cond (NULL_TREE),
823 unaligned_dr (NULL),
824 peeling_for_alignment (0),
825 ptr_mask (0),
826 ivexpr_map (NULL),
827 scan_map (NULL),
828 slp_unrolling_factor (1),
829 single_scalar_iteration_cost (0),
830 vectorizable (false),
831 can_fully_mask_p (true),
832 fully_masked_p (false),
833 peeling_for_gaps (false),
834 peeling_for_niter (false),
835 operands_swapped (false),
836 no_data_dependencies (false),
837 has_mask_store (false),
838 scalar_loop (NULL),
839 orig_loop_info (NULL)
841 /* CHECKME: We want to visit all BBs before their successors (except for
842 latch blocks, for which this assertion wouldn't hold). In the simple
 843 case of the loop forms we allow, a dfs order of the BBs would be the same
 844 as a reversed postorder traversal, so we are safe. */
846 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
847 bbs, loop->num_nodes, loop);
848 gcc_assert (nbbs == loop->num_nodes);
850 for (unsigned int i = 0; i < nbbs; i++)
852 basic_block bb = bbs[i];
853 gimple_stmt_iterator si;
855 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
857 gimple *phi = gsi_stmt (si);
858 gimple_set_uid (phi, 0);
859 add_stmt (phi);
862 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
864 gimple *stmt = gsi_stmt (si);
865 gimple_set_uid (stmt, 0);
866 add_stmt (stmt);
 867 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 868 third argument is the #pragma omp simd if (x) condition: when it is 0,
 869 the loop shouldn't be vectorized; when it is a non-zero constant, it
 870 should be vectorized normally; otherwise the loop is versioned, with the
 871 vectorized copy taken if the condition is non-zero at runtime. */
872 if (loop_in->simduid
873 && is_gimple_call (stmt)
874 && gimple_call_internal_p (stmt)
875 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
876 && gimple_call_num_args (stmt) >= 3
877 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
878 && (loop_in->simduid
879 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
881 tree arg = gimple_call_arg (stmt, 2);
882 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
883 simd_if_cond = arg;
884 else
885 gcc_assert (integer_nonzerop (arg));
891 /* Free all levels of MASKS. */
893 void
894 release_vec_loop_masks (vec_loop_masks *masks)
896 rgroup_masks *rgm;
897 unsigned int i;
898 FOR_EACH_VEC_ELT (*masks, i, rgm)
899 rgm->masks.release ();
900 masks->release ();
903 /* Free all memory used by the _loop_vec_info, as well as all the
904 stmt_vec_info structs of all the stmts in the loop. */
906 _loop_vec_info::~_loop_vec_info ()
908 int nbbs;
909 gimple_stmt_iterator si;
910 int j;
912 nbbs = loop->num_nodes;
913 for (j = 0; j < nbbs; j++)
915 basic_block bb = bbs[j];
916 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
918 gimple *stmt = gsi_stmt (si);
920 /* We may have broken canonical form by moving a constant
921 into RHS1 of a commutative op. Fix such occurrences. */
922 if (operands_swapped && is_gimple_assign (stmt))
924 enum tree_code code = gimple_assign_rhs_code (stmt);
926 if ((code == PLUS_EXPR
927 || code == POINTER_PLUS_EXPR
928 || code == MULT_EXPR)
929 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
930 swap_ssa_operands (stmt,
931 gimple_assign_rhs1_ptr (stmt),
932 gimple_assign_rhs2_ptr (stmt));
933 else if (code == COND_EXPR
934 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
936 tree cond_expr = gimple_assign_rhs1 (stmt);
937 enum tree_code cond_code = TREE_CODE (cond_expr);
939 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
941 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
942 0));
943 cond_code = invert_tree_comparison (cond_code,
944 honor_nans);
945 if (cond_code != ERROR_MARK)
947 TREE_SET_CODE (cond_expr, cond_code);
948 swap_ssa_operands (stmt,
949 gimple_assign_rhs2_ptr (stmt),
950 gimple_assign_rhs3_ptr (stmt));
955 gsi_next (&si);
959 free (bbs);
961 release_vec_loop_masks (&masks);
962 delete ivexpr_map;
963 delete scan_map;
965 loop->aux = NULL;
968 /* Return an invariant or register for EXPR and emit necessary
969 computations in the LOOP_VINFO loop preheader. */
971 tree
972 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
974 if (is_gimple_reg (expr)
975 || is_gimple_min_invariant (expr))
976 return expr;
978 if (! loop_vinfo->ivexpr_map)
979 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
980 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
981 if (! cached)
983 gimple_seq stmts = NULL;
984 cached = force_gimple_operand (unshare_expr (expr),
985 &stmts, true, NULL_TREE);
986 if (stmts)
988 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
989 gsi_insert_seq_on_edge_immediate (e, stmts);
992 return cached;
995 /* Return true if we can use CMP_TYPE as the comparison type to produce
996 all masks required to mask LOOP_VINFO. */
998 static bool
999 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1001 rgroup_masks *rgm;
1002 unsigned int i;
1003 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1004 if (rgm->mask_type != NULL_TREE
1005 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1006 cmp_type, rgm->mask_type,
1007 OPTIMIZE_FOR_SPEED))
1008 return false;
1009 return true;
1012 /* Calculate the maximum number of scalars per iteration for every
1013 rgroup in LOOP_VINFO. */
1015 static unsigned int
1016 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1018 unsigned int res = 1;
1019 unsigned int i;
1020 rgroup_masks *rgm;
1021 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1022 res = MAX (res, rgm->max_nscalars_per_iter);
1023 return res;
1026 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1027 whether we can actually generate the masks required. Return true if so,
1028 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
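/* A worked instance of the width computation below (illustrative numbers):
   if the latch is known to run at most 999 times, the header runs at most
   1000 times; with the largest rgroup needing 2 mask bits per scalar
   iteration, the comparison IV must be able to count to 2000, so
   min_ni_width = 11 bits, and a WHILE_ULT-capable integer mode of at least
   that width (preferring Pmode-or-wider IV types, as explained in the
   comment further down) is chosen as the comparison type.  */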
1030 static bool
1031 vect_verify_full_masking (loop_vec_info loop_vinfo)
1033 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1034 unsigned int min_ni_width;
1035 unsigned int max_nscalars_per_iter
1036 = vect_get_max_nscalars_per_iter (loop_vinfo);
1038 /* Use a normal loop if there are no statements that need masking.
1039 This only happens in rare degenerate cases: it means that the loop
1040 has no loads, no stores, and no live-out values. */
1041 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1042 return false;
1044 /* Get the maximum number of iterations that is representable
1045 in the counter type. */
1046 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1047 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1049 /* Get a more refined estimate for the number of iterations. */
1050 widest_int max_back_edges;
1051 if (max_loop_iterations (loop, &max_back_edges))
1052 max_ni = wi::smin (max_ni, max_back_edges + 1);
1054 /* Account for rgroup masks, in which each bit is replicated N times. */
1055 max_ni *= max_nscalars_per_iter;
1057 /* Work out how many bits we need to represent the limit. */
1058 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1060 /* Find a scalar mode for which WHILE_ULT is supported. */
1061 opt_scalar_int_mode cmp_mode_iter;
1062 tree cmp_type = NULL_TREE;
1063 tree iv_type = NULL_TREE;
1064 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1065 unsigned int iv_precision = UINT_MAX;
1067 if (iv_limit != -1)
1068 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1069 UNSIGNED);
1071 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1073 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1074 if (cmp_bits >= min_ni_width
1075 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1077 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1078 if (this_type
1079 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1081 /* Although we could stop as soon as we find a valid mode,
1082 there are at least two reasons why that's not always the
1083 best choice:
1085 - An IV that's Pmode or wider is more likely to be reusable
1086 in address calculations than an IV that's narrower than
1087 Pmode.
1089 - Doing the comparison in IV_PRECISION or wider allows
1090 a natural 0-based IV, whereas using a narrower comparison
1091 type requires mitigations against wrap-around.
1093 Conversely, if the IV limit is variable, doing the comparison
1094 in a wider type than the original type can introduce
1095 unnecessary extensions, so picking the widest valid mode
1096 is not always a good choice either.
1098 Here we prefer the first IV type that's Pmode or wider,
1099 and the first comparison type that's IV_PRECISION or wider.
1100 (The comparison type must be no wider than the IV type,
1101 to avoid extensions in the vector loop.)
1103 ??? We might want to try continuing beyond Pmode for ILP32
1104 targets if CMP_BITS < IV_PRECISION. */
1105 iv_type = this_type;
1106 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1107 cmp_type = this_type;
1108 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1109 break;
1114 if (!cmp_type)
1115 return false;
1117 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1118 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1119 return true;
1122 /* Calculate the cost of one scalar iteration of the loop. */
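/* A worked instance (illustrative numbers): a loop body consisting of two
   loads, one store and one addition contributes 2 * scalar_load,
   1 * scalar_store and 1 * scalar_stmt entries to the cost vector, each
   with factor 1 (statements sitting in an inner loop instead use the
   FORNOW factor of 50 chosen in the body of this function); the target's
   add_stmt_cost/finish_cost hooks then turn those counts into
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */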
1123 static void
1124 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1126 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1127 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1128 int nbbs = loop->num_nodes, factor;
1129 int innerloop_iters, i;
1131 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1133 /* Gather costs for statements in the scalar loop. */
1135 /* FORNOW. */
1136 innerloop_iters = 1;
1137 if (loop->inner)
1138 innerloop_iters = 50; /* FIXME */
1140 for (i = 0; i < nbbs; i++)
1142 gimple_stmt_iterator si;
1143 basic_block bb = bbs[i];
1145 if (bb->loop_father == loop->inner)
1146 factor = innerloop_iters;
1147 else
1148 factor = 1;
1150 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1152 gimple *stmt = gsi_stmt (si);
1153 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1155 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1156 continue;
1158 /* Skip stmts that are not vectorized inside the loop. */
1159 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1160 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1161 && (!STMT_VINFO_LIVE_P (vstmt_info)
1162 || !VECTORIZABLE_CYCLE_DEF
1163 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1164 continue;
1166 vect_cost_for_stmt kind;
1167 if (STMT_VINFO_DATA_REF (stmt_info))
1169 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1170 kind = scalar_load;
1171 else
1172 kind = scalar_store;
1174 else
1175 kind = scalar_stmt;
1177 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1178 factor, kind, stmt_info, 0, vect_prologue);
1182 /* Now accumulate cost. */
1183 void *target_cost_data = init_cost (loop);
1184 stmt_info_for_cost *si;
1185 int j;
1186 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1187 j, si)
1188 (void) add_stmt_cost (target_cost_data, si->count,
1189 si->kind, si->stmt_info, si->misalign,
1190 vect_body);
1191 unsigned dummy, body_cost = 0;
1192 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1193 destroy_cost_data (target_cost_data);
1194 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1198 /* Function vect_analyze_loop_form_1.
1200 Verify that certain CFG restrictions hold, including:
1201 - the loop has a pre-header
1202 - the loop has a single entry and exit
1203 - the loop exit condition is simple enough
 1204 - the number of iterations can be analyzed, i.e., a countable loop. The
1205 niter could be analyzed under some assumptions. */
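/* Source-level intuition (illustrative): a vectorizable inner-most loop is
   expected to look, after if-conversion and canonicalization, like

     if (n > 0)
       do
         {
           a[i] = b[i] + c[i];
           i++;
         }
       while (i < n);              single exit, tested at the bottom

   i.e. a header block holding the body and the exit test plus an empty
   latch (two basic blocks in total), whereas an extra exit such as a
   "break" that if-conversion could not remove introduces additional blocks
   and edges and is rejected below as "control flow in loop" or
   "multiple exits".  */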
1207 opt_result
1208 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1209 tree *assumptions, tree *number_of_iterationsm1,
1210 tree *number_of_iterations, gcond **inner_loop_cond)
1212 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1214 /* Different restrictions apply when we are considering an inner-most loop,
1215 vs. an outer (nested) loop.
1216 (FORNOW. May want to relax some of these restrictions in the future). */
1218 if (!loop->inner)
1220 /* Inner-most loop. We currently require that the number of BBs is
1221 exactly 2 (the header and latch). Vectorizable inner-most loops
1222 look like this:
1224 (pre-header)
1226 header <--------+
1227 | | |
1228 | +--> latch --+
1230 (exit-bb) */
1232 if (loop->num_nodes != 2)
1233 return opt_result::failure_at (vect_location,
1234 "not vectorized:"
1235 " control flow in loop.\n");
1237 if (empty_block_p (loop->header))
1238 return opt_result::failure_at (vect_location,
1239 "not vectorized: empty loop.\n");
1241 else
1243 struct loop *innerloop = loop->inner;
1244 edge entryedge;
1246 /* Nested loop. We currently require that the loop is doubly-nested,
1247 contains a single inner loop, and the number of BBs is exactly 5.
1248 Vectorizable outer-loops look like this:
1250 (pre-header)
1252 header <---+
1254 inner-loop |
1256 tail ------+
1258 (exit-bb)
1260 The inner-loop has the properties expected of inner-most loops
1261 as described above. */
1263 if ((loop->inner)->inner || (loop->inner)->next)
1264 return opt_result::failure_at (vect_location,
1265 "not vectorized:"
1266 " multiple nested loops.\n");
1268 if (loop->num_nodes != 5)
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized:"
1271 " control flow in loop.\n");
1273 entryedge = loop_preheader_edge (innerloop);
1274 if (entryedge->src != loop->header
1275 || !single_exit (innerloop)
1276 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1277 return opt_result::failure_at (vect_location,
1278 "not vectorized:"
1279 " unsupported outerloop form.\n");
1281 /* Analyze the inner-loop. */
1282 tree inner_niterm1, inner_niter, inner_assumptions;
1283 opt_result res
1284 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1285 &inner_assumptions, &inner_niterm1,
1286 &inner_niter, NULL);
1287 if (!res)
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: Bad inner loop.\n");
1292 return res;
1295 /* Don't support analyzing niter under assumptions for inner
1296 loop. */
1297 if (!integer_onep (inner_assumptions))
1298 return opt_result::failure_at (vect_location,
1299 "not vectorized: Bad inner loop.\n");
1301 if (!expr_invariant_in_loop_p (loop, inner_niter))
1302 return opt_result::failure_at (vect_location,
1303 "not vectorized: inner-loop count not"
1304 " invariant.\n");
1306 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_NOTE, vect_location,
1308 "Considering outer-loop vectorization.\n");
1311 if (!single_exit (loop))
1312 return opt_result::failure_at (vect_location,
1313 "not vectorized: multiple exits.\n");
1314 if (EDGE_COUNT (loop->header->preds) != 2)
1315 return opt_result::failure_at (vect_location,
1316 "not vectorized:"
1317 " too many incoming edges.\n");
 1319 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1320 that the loop is represented as a do-while (with a proper if-guard
1321 before the loop if needed), where the loop header contains all the
1322 executable statements, and the latch is empty. */
1323 if (!empty_block_p (loop->latch)
1324 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1325 return opt_result::failure_at (vect_location,
1326 "not vectorized: latch block not empty.\n");
1328 /* Make sure the exit is not abnormal. */
1329 edge e = single_exit (loop);
1330 if (e->flags & EDGE_ABNORMAL)
1331 return opt_result::failure_at (vect_location,
1332 "not vectorized:"
1333 " abnormal loop exit edge.\n");
1335 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1336 number_of_iterationsm1);
1337 if (!*loop_cond)
1338 return opt_result::failure_at
1339 (vect_location,
1340 "not vectorized: complicated exit condition.\n");
1342 if (integer_zerop (*assumptions)
1343 || !*number_of_iterations
1344 || chrec_contains_undetermined (*number_of_iterations))
1345 return opt_result::failure_at
1346 (*loop_cond,
1347 "not vectorized: number of iterations cannot be computed.\n");
1349 if (integer_zerop (*number_of_iterations))
1350 return opt_result::failure_at
1351 (*loop_cond,
1352 "not vectorized: number of iterations = 0.\n");
1354 return opt_result::success ();
1357 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1359 opt_loop_vec_info
1360 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1362 tree assumptions, number_of_iterations, number_of_iterationsm1;
1363 gcond *loop_cond, *inner_loop_cond = NULL;
1365 opt_result res
1366 = vect_analyze_loop_form_1 (loop, &loop_cond,
1367 &assumptions, &number_of_iterationsm1,
1368 &number_of_iterations, &inner_loop_cond);
1369 if (!res)
1370 return opt_loop_vec_info::propagate_failure (res);
1372 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1373 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1374 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1375 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1376 if (!integer_onep (assumptions))
1378 /* We consider to vectorize this loop by versioning it under
1379 some assumptions. In order to do this, we need to clear
1380 existing information computed by scev and niter analyzer. */
1381 scev_reset_htab ();
1382 free_numbers_of_iterations_estimates (loop);
1383 /* Also set flag for this loop so that following scev and niter
1384 analysis are done under the assumptions. */
1385 loop_constraint_set (loop, LOOP_C_FINITE);
1386 /* Also record the assumptions for versioning. */
1387 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1390 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1392 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_NOTE, vect_location,
1395 "Symbolic number of iterations is ");
1396 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1397 dump_printf (MSG_NOTE, "\n");
1401 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1402 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1403 if (inner_loop_cond)
1405 stmt_vec_info inner_loop_cond_info
1406 = loop_vinfo->lookup_stmt (inner_loop_cond);
1407 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1410 gcc_assert (!loop->aux);
1411 loop->aux = loop_vinfo;
1412 return opt_loop_vec_info::success (loop_vinfo);
1417 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1418 statements update the vectorization factor. */
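/* A worked instance (illustrative numbers): if the non-SLP statements in the
   loop require VF = 2 (say, 8-byte elements in 16-byte vectors) while an SLP
   instance needs an unrolling factor of 3, force_common_multiple yields 6,
   so after this update each vector iteration covers 6 scalar iterations.
   When every statement is covered by SLP, the SLP unrolling factor is used
   directly instead.  */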
1420 static void
1421 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1423 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1424 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1425 int nbbs = loop->num_nodes;
1426 poly_uint64 vectorization_factor;
1427 int i;
1429 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1431 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1432 gcc_assert (known_ne (vectorization_factor, 0U));
1434 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1435 vectorization factor of the loop is the unrolling factor required by
 1436 the SLP instances. If that unrolling factor is 1, we say that we
 1437 perform pure SLP on the loop; cross-iteration parallelism is not
 1438 exploited. */
1439 bool only_slp_in_loop = true;
1440 for (i = 0; i < nbbs; i++)
1442 basic_block bb = bbs[i];
1443 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1444 gsi_next (&si))
1446 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1447 stmt_info = vect_stmt_to_vectorize (stmt_info);
1448 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1449 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1450 && !PURE_SLP_STMT (stmt_info))
1451 /* STMT needs both SLP and loop-based vectorization. */
1452 only_slp_in_loop = false;
1456 if (only_slp_in_loop)
1458 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Loop contains only SLP stmts\n");
1461 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1463 else
1465 if (dump_enabled_p ())
1466 dump_printf_loc (MSG_NOTE, vect_location,
1467 "Loop contains SLP and non-SLP stmts\n");
1468 /* Both the vectorization factor and unroll factor have the form
1469 current_vector_size * X for some rational X, so they must have
1470 a common multiple. */
1471 vectorization_factor
1472 = force_common_multiple (vectorization_factor,
1473 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1476 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1477 if (dump_enabled_p ())
1479 dump_printf_loc (MSG_NOTE, vect_location,
1480 "Updating vectorization factor to ");
1481 dump_dec (MSG_NOTE, vectorization_factor);
1482 dump_printf (MSG_NOTE, ".\n");
1486 /* Return true if STMT_INFO describes a double reduction phi and if
1487 the other phi in the reduction is also relevant for vectorization.
1488 This rejects cases such as:
1490 outer1:
1491 x_1 = PHI <x_3(outer2), ...>;
1494 inner:
1495 x_2 = ...;
1498 outer2:
1499 x_3 = PHI <x_2(inner)>;
1501 if nothing in x_2 or elsewhere makes x_1 relevant. */
1503 static bool
1504 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1506 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1507 return false;
1509 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1512 /* Function vect_analyze_loop_operations.
1514 Scan the loop stmts and make sure they are all vectorizable. */
1516 static opt_result
1517 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1521 int nbbs = loop->num_nodes;
1522 int i;
1523 stmt_vec_info stmt_info;
1524 bool need_to_vectorize = false;
1525 bool ok;
1527 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1529 auto_vec<stmt_info_for_cost> cost_vec;
1531 for (i = 0; i < nbbs; i++)
1533 basic_block bb = bbs[i];
1535 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1536 gsi_next (&si))
1538 gphi *phi = si.phi ();
1539 ok = true;
1541 stmt_info = loop_vinfo->lookup_stmt (phi);
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1544 if (virtual_operand_p (gimple_phi_result (phi)))
1545 continue;
1547 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1548 (i.e., a phi in the tail of the outer-loop). */
1549 if (! is_loop_header_bb_p (bb))
1551 /* FORNOW: we currently don't support the case that these phis
1552 are not used in the outerloop (unless it is double reduction,
1553 i.e., this phi is vect_reduction_def), cause this case
1554 requires to actually do something here. */
1555 if (STMT_VINFO_LIVE_P (stmt_info)
1556 && !vect_active_double_reduction_p (stmt_info))
1557 return opt_result::failure_at (phi,
1558 "Unsupported loop-closed phi"
1559 " in outer-loop.\n");
1561 /* If PHI is used in the outer loop, we check that its operand
1562 is defined in the inner loop. */
1563 if (STMT_VINFO_RELEVANT_P (stmt_info))
1565 tree phi_op;
1567 if (gimple_phi_num_args (phi) != 1)
1568 return opt_result::failure_at (phi, "unsupported phi");
1570 phi_op = PHI_ARG_DEF (phi, 0);
1571 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1572 if (!op_def_info)
1573 return opt_result::failure_at (phi, "unsupported phi");
1575 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1576 && (STMT_VINFO_RELEVANT (op_def_info)
1577 != vect_used_in_outer_by_reduction))
1578 return opt_result::failure_at (phi, "unsupported phi");
1581 continue;
1584 gcc_assert (stmt_info);
1586 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1587 || STMT_VINFO_LIVE_P (stmt_info))
1588 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1589 /* A scalar-dependence cycle that we don't support. */
1590 return opt_result::failure_at (phi,
1591 "not vectorized:"
1592 " scalar dependence cycle.\n");
1594 if (STMT_VINFO_RELEVANT_P (stmt_info))
1596 need_to_vectorize = true;
1597 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1598 && ! PURE_SLP_STMT (stmt_info))
1599 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1600 &cost_vec);
1601 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1602 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1603 && ! PURE_SLP_STMT (stmt_info))
1604 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1605 &cost_vec);
1608 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1609 if (ok
1610 && STMT_VINFO_LIVE_P (stmt_info)
1611 && !PURE_SLP_STMT (stmt_info))
1612 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1613 &cost_vec);
1615 if (!ok)
1616 return opt_result::failure_at (phi,
1617 "not vectorized: relevant phi not "
1618 "supported: %G",
1619 static_cast <gimple *> (phi));
1622 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1623 gsi_next (&si))
1625 gimple *stmt = gsi_stmt (si);
1626 if (!gimple_clobber_p (stmt))
1628 opt_result res
1629 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1630 &need_to_vectorize,
1631 NULL, NULL, &cost_vec);
1632 if (!res)
1633 return res;
1636 } /* bbs */
1638 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1640 /* All operations in the loop are either irrelevant (deal with loop
1641 control, or dead), or only used outside the loop and can be moved
1642 out of the loop (e.g. invariants, inductions). The loop can be
1643 optimized away by scalar optimizations. We're better off not
1644 touching this loop. */
1645 if (!need_to_vectorize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE, vect_location,
1649 "All the computation can be taken out of the loop.\n");
1650 return opt_result::failure_at
1651 (vect_location,
1652 "not vectorized: redundant loop. no profit to vectorize.\n");
1655 return opt_result::success ();
1658 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1659 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1660 definitely no, or -1 if it's worth retrying. */
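/* A worked instance of the thresholds computed below (illustrative numbers):
   with an assumed VF of 4, min_profitable_iters = 7 from the cost model and
   --param min-vect-loop-bound=3, the threshold is
   th = MAX (3 * 4, 7) = 12; a loop whose iteration count is known to be
   below 12 is rejected outright (return 0), while one whose *estimated*
   count is below MAX (th, min_profitable_estimate) is rejected as worth
   retrying with a different vector size (return -1).  */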
1662 static int
1663 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1665 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1666 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1668 /* Only fully-masked loops can have iteration counts less than the
1669 vectorization factor. */
1670 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1672 HOST_WIDE_INT max_niter;
1674 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1675 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1676 else
1677 max_niter = max_stmt_executions_int (loop);
1679 if (max_niter != -1
1680 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1684 "not vectorized: iteration count smaller than "
1685 "vectorization factor.\n");
1686 return 0;
1690 int min_profitable_iters, min_profitable_estimate;
1691 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1692 &min_profitable_estimate);
1694 if (min_profitable_iters < 0)
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: vectorization not profitable.\n");
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701 "not vectorized: vector version will never be "
1702 "profitable.\n");
1703 return -1;
1706 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1707 * assumed_vf);
1709 /* Use the cost model only if it is more conservative than user specified
1710 threshold. */
1711 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1712 min_profitable_iters);
1714 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1716 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1717 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721 "not vectorized: vectorization not profitable.\n");
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_NOTE, vect_location,
1724 "not vectorized: iteration count smaller than user "
1725 "specified loop bound parameter or minimum profitable "
1726 "iterations (whichever is more conservative).\n");
1727 return 0;
1730 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1731 if (estimated_niter == -1)
1732 estimated_niter = likely_max_stmt_executions_int (loop);
1733 if (estimated_niter != -1
1734 && ((unsigned HOST_WIDE_INT) estimated_niter
1735 < MAX (th, (unsigned) min_profitable_estimate)))
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1739 "not vectorized: estimated iteration count too "
1740 "small.\n");
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "not vectorized: estimated iteration count smaller "
1744 "than specified loop bound parameter or minimum "
1745 "profitable iterations (whichever is more "
1746 "conservative).\n");
1747 return -1;
1750 return 1;
1753 static opt_result
1754 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1755 vec<data_reference_p> *datarefs,
1756 unsigned int *n_stmts)
1758 *n_stmts = 0;
1759 for (unsigned i = 0; i < loop->num_nodes; i++)
1760 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1761 !gsi_end_p (gsi); gsi_next (&gsi))
1763 gimple *stmt = gsi_stmt (gsi);
1764 if (is_gimple_debug (stmt))
1765 continue;
1766 ++(*n_stmts);
1767 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1768 if (!res)
1770 if (is_gimple_call (stmt) && loop->safelen)
1772 tree fndecl = gimple_call_fndecl (stmt), op;
1773 if (fndecl != NULL_TREE)
1775 cgraph_node *node = cgraph_node::get (fndecl);
1776 if (node != NULL && node->simd_clones != NULL)
1778 unsigned int j, n = gimple_call_num_args (stmt);
1779 for (j = 0; j < n; j++)
1781 op = gimple_call_arg (stmt, j);
1782 if (DECL_P (op)
1783 || (REFERENCE_CLASS_P (op)
1784 && get_base_address (op)))
1785 break;
1787 op = gimple_call_lhs (stmt);
1788 /* Ignore #pragma omp declare simd functions
1789 if they don't have data references in the
1790 call stmt itself. */
1791 if (j == n
1792 && !(op
1793 && (DECL_P (op)
1794 || (REFERENCE_CLASS_P (op)
1795 && get_base_address (op)))))
1796 continue;
1800 return res;
1802 /* If dependence analysis will give up due to the limit on the
1803 number of datarefs stop here and fail fatally. */
1804 if (datarefs->length ()
1805 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1806 return opt_result::failure_at (stmt, "exceeded param "
1807 "loop-max-datarefs-for-datadeps\n");
1809 return opt_result::success ();
1812 /* Look for SLP-only access groups and turn each individual access into its own
1813 group. */
1814 static void
1815 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1817 unsigned int i;
1818 struct data_reference *dr;
1820 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1822 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1823 FOR_EACH_VEC_ELT (datarefs, i, dr)
1825 gcc_assert (DR_REF (dr));
1826 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1828 /* Check if the load is a part of an interleaving chain. */
1829 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1831 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1832 unsigned int group_size = DR_GROUP_SIZE (first_element);
1834 /* Check if this is an SLP-only group. */
1835 if (!STMT_SLP_TYPE (stmt_info)
1836 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1838 /* Dissolve the group. */
1839 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1841 stmt_vec_info vinfo = first_element;
1842 while (vinfo)
1844 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1845 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1846 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1847 DR_GROUP_SIZE (vinfo) = 1;
1848 DR_GROUP_GAP (vinfo) = group_size - 1;
1849 vinfo = next;
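/* Minimal sketch (hypothetical types, illustrative only) of the group
   dissolution performed above, on a plain singly linked list: every
   element becomes a group of its own, and the original stride is kept
   as a gap so the access pattern stays the same.  */
struct example_group_elt
{
  struct example_group_elt *first, *next;
  unsigned int size, gap;
};

static void
example_dissolve_group (struct example_group_elt *first_element,
                        unsigned int group_size)
{
  struct example_group_elt *elt = first_element;
  while (elt)
    {
      struct example_group_elt *next = elt->next;
      elt->first = elt;
      elt->next = 0;            /* i.e. NULL */
      elt->size = 1;
      elt->gap = group_size - 1;
      elt = next;
    }
}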
1856 /* Function vect_analyze_loop_2.
1858 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1859 for it. The different analyses will record information in the
1860 loop_vec_info struct. */
1861 static opt_result
1862 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1864 opt_result ok = opt_result::success ();
1865 int res;
1866 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1867 poly_uint64 min_vf = 2;
1869 /* The first group of checks is independent of the vector size. */
1870 fatal = true;
1872 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1873 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1874 return opt_result::failure_at (vect_location,
1875 "not vectorized: simd if(0)\n");
1877 /* Find all data references in the loop (which correspond to vdefs/vuses)
1878 and analyze their evolution in the loop. */
1880 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1882 /* Gather the data references and count stmts in the loop. */
1883 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1885 opt_result res
1886 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1887 &LOOP_VINFO_DATAREFS (loop_vinfo),
1888 n_stmts);
1889 if (!res)
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 "not vectorized: loop contains function "
1894 "calls or data references that cannot "
1895 "be analyzed\n");
1896 return res;
1898 loop_vinfo->shared->save_datarefs ();
1900 else
1901 loop_vinfo->shared->check_datarefs ();
1903 /* Analyze the data references and also adjust the minimal
1904 vectorization factor according to the loads and stores. */
1906 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1907 if (!ok)
1909 if (dump_enabled_p ())
1910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1911 "bad data references.\n");
1912 return ok;
1915 /* Classify all cross-iteration scalar data-flow cycles.
1916 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1917 vect_analyze_scalar_cycles (loop_vinfo);
1919 vect_pattern_recog (loop_vinfo);
1921 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1923 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1924 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1926 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1927 if (!ok)
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "bad data access.\n");
1932 return ok;
1935 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1937 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1938 if (!ok)
1940 if (dump_enabled_p ())
1941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1942 "unexpected pattern.\n");
1943 return ok;
1946 /* The rest of the analysis below depends on the vector size in some way, so a failure is no longer fatal. */
1947 fatal = false;
1949 /* Analyze data dependences between the data-refs in the loop
1950 and adjust the maximum vectorization factor according to
1951 the dependences.
1952 FORNOW: fail at the first data dependence that we encounter. */
1954 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1955 if (!ok)
1957 if (dump_enabled_p ())
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 "bad data dependence.\n");
1960 return ok;
1962 if (max_vf != MAX_VECTORIZATION_FACTOR
1963 && maybe_lt (max_vf, min_vf))
1964 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1965 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1967 ok = vect_determine_vectorization_factor (loop_vinfo);
1968 if (!ok)
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1972 "can't determine vectorization factor.\n");
1973 return ok;
1975 if (max_vf != MAX_VECTORIZATION_FACTOR
1976 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1977 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1979 /* Compute the scalar iteration cost. */
1980 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1982 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1983 unsigned th;
1985 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1986 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1987 if (!ok)
1988 return ok;
1990 /* If there are any SLP instances mark them as pure_slp. */
1991 bool slp = vect_make_slp_decision (loop_vinfo);
1992 if (slp)
1994 /* Find stmts that need to be both vectorized and SLPed. */
1995 vect_detect_hybrid_slp (loop_vinfo);
1997 /* Update the vectorization factor based on the SLP decision. */
1998 vect_update_vf_for_slp (loop_vinfo);
2001 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2003 /* We don't expect to have to roll back to anything other than an empty
2004 set of rgroups. */
2005 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2007 /* This is the point where we can re-start analysis with SLP forced off. */
2008 start_over:
2010 /* Now the vectorization factor is final. */
2011 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2012 gcc_assert (known_ne (vectorization_factor, 0U));
2014 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "vectorization_factor = ");
2018 dump_dec (MSG_NOTE, vectorization_factor);
2019 dump_printf (MSG_NOTE, ", niters = %wd\n",
2020 LOOP_VINFO_INT_NITERS (loop_vinfo));
2023 HOST_WIDE_INT max_niter
2024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2026 /* Analyze the alignment of the data-refs in the loop.
2027 Fail if a data reference is found that cannot be vectorized. */
2029 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2030 if (!ok)
2032 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2034 "bad data alignment.\n");
2035 return ok;
2038 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2039 It is important to call pruning after vect_analyze_data_ref_accesses,
2040 since we use grouping information gathered by interleaving analysis. */
2041 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2042 if (!ok)
2043 return ok;
2045 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2046 vectorization, since we do not want to add extra peeling or
2047 add versioning for alignment. */
2048 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2049 /* This pass will decide on using loop versioning and/or loop peeling in
2050 order to enhance the alignment of data references in the loop. */
2051 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2052 else
2053 ok = vect_verify_datarefs_alignment (loop_vinfo);
2054 if (!ok)
2055 return ok;
2057 if (slp)
2059 /* Analyze operations in the SLP instances. Note this may
2060 remove unsupported SLP instances which makes the above
2061 SLP kind detection invalid. */
2062 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2063 vect_slp_analyze_operations (loop_vinfo);
2064 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2066 ok = opt_result::failure_at (vect_location,
2067 "unsupported SLP instances\n");
2068 goto again;
2072 /* Dissolve SLP-only groups. */
2073 vect_dissolve_slp_only_groups (loop_vinfo);
2075 /* Scan all the remaining operations in the loop that are not subject
2076 to SLP and make sure they are vectorizable. */
2077 ok = vect_analyze_loop_operations (loop_vinfo);
2078 if (!ok)
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2082 "bad operation or unsupported loop bound.\n");
2083 return ok;
2086 /* Decide whether to use a fully-masked loop for this vectorization
2087 factor. */
2088 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2089 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2090 && vect_verify_full_masking (loop_vinfo));
2091 if (dump_enabled_p ())
2093 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2094 dump_printf_loc (MSG_NOTE, vect_location,
2095 "using a fully-masked loop.\n");
2096 else
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "not using a fully-masked loop.\n");
2101 /* If an epilogue loop is required because of data accesses with gaps,
2102 one additional iteration needs to be peeled. Check if there are
2103 enough iterations for vectorization. */
2104 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2105 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2106 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2108 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2109 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2111 if (known_lt (wi::to_widest (scalar_niters), vf))
2112 return opt_result::failure_at (vect_location,
2113 "loop has no enough iterations to"
2114 " support peeling for gaps.\n");
2117 /* Check that the costings of the loop make vectorizing worthwhile. */
2118 res = vect_analyze_loop_costing (loop_vinfo);
2119 if (res < 0)
2121 ok = opt_result::failure_at (vect_location,
2122 "Loop costings may not be worthwhile.\n");
2123 goto again;
2125 if (!res)
2126 return opt_result::failure_at (vect_location,
2127 "Loop costings not worthwhile.\n");
2129 /* Decide whether we need to create an epilogue loop to handle
2130 remaining scalar iterations. */
2131 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2133 unsigned HOST_WIDE_INT const_vf;
2134 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2135 /* The main loop handles all iterations. */
2136 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2137 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2138 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2140 /* Work out the (constant) number of iterations that need to be
2141 peeled for reasons other than niters. */
2142 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2143 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2144 peel_niter += 1;
2145 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2146 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2147 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2149 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2150 /* ??? When peeling for gaps but not alignment, we could
2151 try to check whether the (variable) niters is known to be
2152 VF * N + 1. That's something of a niche case though. */
2153 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2154 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2155 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2156 < (unsigned) exact_log2 (const_vf))
2157 /* In case of versioning, check if the maximum number of
2158 iterations is greater than th. If they are identical,
2159 the epilogue is unnecessary. */
2160 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2161 || ((unsigned HOST_WIDE_INT) max_niter
2162 > (th / const_vf) * const_vf))))
2163 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
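  /* Worked example of the epilogue decision above (illustrative numbers
     only): with LOOP_VINFO_INT_NITERS = 100, a vectorization factor of 8
     and peel_niter = 3 (alignment peel of 2 plus 1 for gaps), 100 - 3 = 97
     is not a multiple of 8, so PEELING_FOR_NITER is set and an epilogue
     loop handles the remaining 97 % 8 = 1 scalar iteration.  */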
2165 /* If an epilogue loop is required make sure we can create one. */
2166 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2167 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2169 if (dump_enabled_p ())
2170 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2171 if (!vect_can_advance_ivs_p (loop_vinfo)
2172 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2173 single_exit (LOOP_VINFO_LOOP
2174 (loop_vinfo))))
2176 ok = opt_result::failure_at (vect_location,
2177 "not vectorized: can't create required "
2178 "epilog loop\n");
2179 goto again;
2183 /* During peeling, we need to check whether the number of loop iterations
2184 is enough for both the peeled prolog loop and the vector loop. This
2185 check can be merged with the threshold check of loop versioning, so
2186 increase the threshold for this case if necessary. */
2187 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2189 poly_uint64 niters_th = 0;
2191 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2193 /* Niters for peeled prolog loop. */
2194 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2196 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2197 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2198 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2200 else
2201 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2204 /* Niters for at least one iteration of vectorized loop. */
2205 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2206 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2207 /* One additional iteration because of peeling for gaps. */
2208 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2209 niters_th += 1;
2210 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
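  /* Worked example of the versioning threshold above (illustrative
     numbers only): unknown alignment peeling of a V8HI data reference
     contributes 8 - 1 = 7 iterations, one full vector iteration adds
     VF = 8, and peeling for gaps adds 1, so the runtime check requires
     at least 7 + 8 + 1 = 16 scalar iterations to take the vector path.  */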
2213 gcc_assert (known_eq (vectorization_factor,
2214 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2216 /* Ok to vectorize! */
2217 return opt_result::success ();
2219 again:
2220 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2221 gcc_assert (!ok);
2223 /* Try again with SLP forced off but if we didn't do any SLP there is
2224 no point in re-trying. */
2225 if (!slp)
2226 return ok;
2228 /* If there are reduction chains re-trying will fail anyway. */
2229 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2230 return ok;
2232 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2233 via interleaving or lane instructions. */
2234 slp_instance instance;
2235 slp_tree node;
2236 unsigned i, j;
2237 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2239 stmt_vec_info vinfo;
2240 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2241 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2242 continue;
2243 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2244 unsigned int size = DR_GROUP_SIZE (vinfo);
2245 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2246 if (! vect_store_lanes_supported (vectype, size, false)
2247 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2248 && ! vect_grouped_store_supported (vectype, size))
2249 return opt_result::failure_at (vinfo->stmt,
2250 "unsupported grouped store\n");
2251 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2253 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2254 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2255 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2256 size = DR_GROUP_SIZE (vinfo);
2257 vectype = STMT_VINFO_VECTYPE (vinfo);
2258 if (! vect_load_lanes_supported (vectype, size, false)
2259 && ! vect_grouped_load_supported (vectype, single_element_p,
2260 size))
2261 return opt_result::failure_at (vinfo->stmt,
2262 "unsupported grouped load\n");
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "re-trying with SLP disabled\n");
2270 /* Roll back state appropriately. No SLP this time. */
2271 slp = false;
2272 /* Restore the vectorization factor as it was without SLP. */
2273 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2274 /* Free the SLP instances. */
2275 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2276 vect_free_slp_instance (instance, false);
2277 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2278 /* Reset SLP type to loop_vect on all stmts. */
2279 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2281 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2282 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2283 !gsi_end_p (si); gsi_next (&si))
2285 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2286 STMT_SLP_TYPE (stmt_info) = loop_vect;
2288 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2289 !gsi_end_p (si); gsi_next (&si))
2291 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2292 STMT_SLP_TYPE (stmt_info) = loop_vect;
2293 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2295 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2296 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2297 STMT_SLP_TYPE (stmt_info) = loop_vect;
2298 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2299 !gsi_end_p (pi); gsi_next (&pi))
2300 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2301 = loop_vect;
2305 /* Free optimized alias test DDRS. */
2306 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2307 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2308 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2309 /* Reset target cost data. */
2310 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2311 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2312 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2313 /* Reset accumulated rgroup information. */
2314 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2315 /* Reset assorted flags. */
2316 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2317 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2318 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2319 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2320 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2322 goto start_over;
2325 /* Function vect_analyze_loop.
2327 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2328 for it. The different analyses will record information in the
2329 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2330 be vectorized. */
2331 opt_loop_vec_info
2332 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2333 vec_info_shared *shared)
2335 auto_vector_sizes vector_sizes;
2337 /* Autodetect first vector size we try. */
2338 current_vector_size = 0;
2339 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2340 loop->simdlen != 0);
2341 unsigned int next_size = 0;
2343 DUMP_VECT_SCOPE ("analyze_loop_nest");
2345 if (loop_outer (loop)
2346 && loop_vec_info_for_loop (loop_outer (loop))
2347 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2348 return opt_loop_vec_info::failure_at (vect_location,
2349 "outer-loop already vectorized.\n");
2351 if (!find_loop_nest (loop, &shared->loop_nest))
2352 return opt_loop_vec_info::failure_at
2353 (vect_location,
2354 "not vectorized: loop nest containing two or more consecutive inner"
2355 " loops cannot be vectorized\n");
2357 unsigned n_stmts = 0;
2358 poly_uint64 autodetected_vector_size = 0;
2359 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2360 poly_uint64 first_vector_size = 0;
2361 while (1)
2363 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2364 opt_loop_vec_info loop_vinfo
2365 = vect_analyze_loop_form (loop, shared);
2366 if (!loop_vinfo)
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370 "bad loop form.\n");
2371 gcc_checking_assert (first_loop_vinfo == NULL);
2372 return loop_vinfo;
2375 bool fatal = false;
2377 if (orig_loop_vinfo)
2378 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2380 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2381 if (res)
2383 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2385 if (loop->simdlen
2386 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2387 (unsigned HOST_WIDE_INT) loop->simdlen))
2389 if (first_loop_vinfo == NULL)
2391 first_loop_vinfo = loop_vinfo;
2392 first_vector_size = current_vector_size;
2393 loop->aux = NULL;
2395 else
2396 delete loop_vinfo;
2398 else
2400 delete first_loop_vinfo;
2401 return loop_vinfo;
2404 else
2405 delete loop_vinfo;
2407 if (next_size == 0)
2408 autodetected_vector_size = current_vector_size;
2410 if (next_size < vector_sizes.length ()
2411 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2412 next_size += 1;
2414 if (fatal)
2416 gcc_checking_assert (first_loop_vinfo == NULL);
2417 return opt_loop_vec_info::propagate_failure (res);
2420 if (next_size == vector_sizes.length ()
2421 || known_eq (current_vector_size, 0U))
2423 if (first_loop_vinfo)
2425 current_vector_size = first_vector_size;
2426 loop->aux = (loop_vec_info) first_loop_vinfo;
2427 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE, vect_location,
2430 "***** Choosing vector size ");
2431 dump_dec (MSG_NOTE, current_vector_size);
2432 dump_printf (MSG_NOTE, "\n");
2434 return first_loop_vinfo;
2436 else
2437 return opt_loop_vec_info::propagate_failure (res);
2440 /* Try the next biggest vector size. */
2441 current_vector_size = vector_sizes[next_size++];
2442 if (dump_enabled_p ())
2444 dump_printf_loc (MSG_NOTE, vect_location,
2445 "***** Re-trying analysis with "
2446 "vector size ");
2447 dump_dec (MSG_NOTE, current_vector_size);
2448 dump_printf (MSG_NOTE, "\n");
2453 /* Return true if there is an in-order reduction function for CODE, storing
2454 it in *REDUC_FN if so. */
2456 static bool
2457 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2459 switch (code)
2461 case PLUS_EXPR:
2462 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2463 return true;
2465 default:
2466 return false;
2470 /* Function reduction_fn_for_scalar_code
2472 Input:
2473 CODE - tree_code of the reduction operation.
2475 Output:
2476 REDUC_FN - the corresponding internal function to be used to reduce the
2477 vector of partial results into a single scalar result, or IFN_LAST
2478 if the operation is a supported reduction operation, but does not have
2479 such an internal function.
2481 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2483 static bool
2484 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2486 switch (code)
2488 case MAX_EXPR:
2489 *reduc_fn = IFN_REDUC_MAX;
2490 return true;
2492 case MIN_EXPR:
2493 *reduc_fn = IFN_REDUC_MIN;
2494 return true;
2496 case PLUS_EXPR:
2497 *reduc_fn = IFN_REDUC_PLUS;
2498 return true;
2500 case BIT_AND_EXPR:
2501 *reduc_fn = IFN_REDUC_AND;
2502 return true;
2504 case BIT_IOR_EXPR:
2505 *reduc_fn = IFN_REDUC_IOR;
2506 return true;
2508 case BIT_XOR_EXPR:
2509 *reduc_fn = IFN_REDUC_XOR;
2510 return true;
2512 case MULT_EXPR:
2513 case MINUS_EXPR:
2514 *reduc_fn = IFN_LAST;
2515 return true;
2517 default:
2518 return false;
2522 /* If there is a neutral value X such that SLP reduction NODE would not
2523 be affected by the introduction of additional X elements, return that X,
2524 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2525 is true if the SLP statements perform a single reduction, false if each
2526 statement performs an independent reduction. */
2528 static tree
2529 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2530 bool reduc_chain)
2532 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2533 stmt_vec_info stmt_vinfo = stmts[0];
2534 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2535 tree scalar_type = TREE_TYPE (vector_type);
2536 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2537 gcc_assert (loop);
2539 switch (code)
2541 case WIDEN_SUM_EXPR:
2542 case DOT_PROD_EXPR:
2543 case SAD_EXPR:
2544 case PLUS_EXPR:
2545 case MINUS_EXPR:
2546 case BIT_IOR_EXPR:
2547 case BIT_XOR_EXPR:
2548 return build_zero_cst (scalar_type);
2550 case MULT_EXPR:
2551 return build_one_cst (scalar_type);
2553 case BIT_AND_EXPR:
2554 return build_all_ones_cst (scalar_type);
2556 case MAX_EXPR:
2557 case MIN_EXPR:
2558 /* For MIN/MAX the initial values are neutral. A reduction chain
2559 has only a single initial value, so that value is neutral for
2560 all statements. */
2561 if (reduc_chain)
2562 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2563 loop_preheader_edge (loop));
2564 return NULL_TREE;
2566 default:
2567 return NULL_TREE;
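/* Illustrative example (not from this file) of the neutral values
   returned above: padding a partial vector with the neutral element
   leaves the reduction result unchanged.  */
static int
example_neutral_padding (void)
{
  int tail[3] = { 5, 7, 11 };
  int sum = 0, prod = 1, all = -1;      /* 0, 1 and all-ones are neutral.  */
  for (int i = 0; i < 3; i++)
    {
      sum += tail[i];
      prod *= tail[i];
      all &= tail[i];
    }
  /* A fourth element holding 0 (for +), 1 (for *) or ~0 (for &) would
     not change sum, prod or all, so a 4-lane vector can be used.  */
  return sum + prod + all;
}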
2571 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2572 STMT is printed with a message MSG. */
2574 static void
2575 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2577 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2580 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2581 operation. Return true if the results of DEF_STMT_INFO are something
2582 that can be accumulated by such a reduction. */
2584 static bool
2585 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2587 return (is_gimple_assign (def_stmt_info->stmt)
2588 || is_gimple_call (def_stmt_info->stmt)
2589 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2590 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2591 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2592 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2595 /* Detect SLP reduction of the form:
2597 #a1 = phi <a5, a0>
2598 a2 = operation (a1)
2599 a3 = operation (a2)
2600 a4 = operation (a3)
2601 a5 = operation (a4)
2603 #a = phi <a5>
2605 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2606 FIRST_STMT is the first reduction stmt in the chain
2607 (a2 = operation (a1)).
2609 Return TRUE if a reduction chain was detected. */
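/* A hypothetical source loop (illustrative only) whose gimple form
   matches the chain documented above: each addition becomes one
   statement a2..a5 feeding back into the reduction phi.  */
static int
example_reduction_chain (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s = s + a[4 * i] + a[4 * i + 1] + a[4 * i + 2] + a[4 * i + 3];
  return s;
}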
2611 static bool
2612 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2613 gimple *first_stmt)
2615 struct loop *loop = (gimple_bb (phi))->loop_father;
2616 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2617 enum tree_code code;
2618 gimple *loop_use_stmt = NULL;
2619 stmt_vec_info use_stmt_info;
2620 tree lhs;
2621 imm_use_iterator imm_iter;
2622 use_operand_p use_p;
2623 int nloop_uses, size = 0, n_out_of_loop_uses;
2624 bool found = false;
2626 if (loop != vect_loop)
2627 return false;
2629 auto_vec<stmt_vec_info, 8> reduc_chain;
2630 lhs = PHI_RESULT (phi);
2631 code = gimple_assign_rhs_code (first_stmt);
2632 while (1)
2634 nloop_uses = 0;
2635 n_out_of_loop_uses = 0;
2636 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2638 gimple *use_stmt = USE_STMT (use_p);
2639 if (is_gimple_debug (use_stmt))
2640 continue;
2642 /* Check if we got back to the reduction phi. */
2643 if (use_stmt == phi)
2645 loop_use_stmt = use_stmt;
2646 found = true;
2647 break;
2650 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2652 loop_use_stmt = use_stmt;
2653 nloop_uses++;
2655 else
2656 n_out_of_loop_uses++;
2658 /* There can be either a single use in the loop or two uses in
2659 phi nodes. */
2660 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2661 return false;
2664 if (found)
2665 break;
2667 /* We reached a statement with no loop uses. */
2668 if (nloop_uses == 0)
2669 return false;
2671 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2672 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2673 return false;
2675 if (!is_gimple_assign (loop_use_stmt)
2676 || code != gimple_assign_rhs_code (loop_use_stmt)
2677 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2678 return false;
2680 /* Insert USE_STMT into reduction chain. */
2681 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2682 reduc_chain.safe_push (use_stmt_info);
2684 lhs = gimple_assign_lhs (loop_use_stmt);
2685 size++;
2688 if (!found || loop_use_stmt != phi || size < 2)
2689 return false;
2691 /* Swap the operands, if needed, to make the reduction operand be the second
2692 operand. */
2693 lhs = PHI_RESULT (phi);
2694 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2696 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2697 if (gimple_assign_rhs2 (next_stmt) == lhs)
2699 tree op = gimple_assign_rhs1 (next_stmt);
2700 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2702 /* Check that the other def is either defined in the loop
2703 ("vect_internal_def"), or it's an induction (defined by a
2704 loop-header phi-node). */
2705 if (def_stmt_info
2706 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2707 && vect_valid_reduction_input_p (def_stmt_info))
2709 lhs = gimple_assign_lhs (next_stmt);
2710 continue;
2713 return false;
2715 else
2717 tree op = gimple_assign_rhs2 (next_stmt);
2718 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2720 /* Check that the other def is either defined in the loop
2721 ("vect_internal_def"), or it's an induction (defined by a
2722 loop-header phi-node). */
2723 if (def_stmt_info
2724 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2725 && vect_valid_reduction_input_p (def_stmt_info))
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2729 next_stmt);
2731 swap_ssa_operands (next_stmt,
2732 gimple_assign_rhs1_ptr (next_stmt),
2733 gimple_assign_rhs2_ptr (next_stmt));
2734 update_stmt (next_stmt);
2736 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2737 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2739 else
2740 return false;
2743 lhs = gimple_assign_lhs (next_stmt);
2746 /* Build up the actual chain. */
2747 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2749 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2750 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2752 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2753 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2755 /* Save the chain for further analysis in SLP detection. */
2756 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2757 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2759 return true;
2762 /* Return true if we need an in-order reduction for operation CODE
2763 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2764 overflow must wrap. */
2766 static bool
2767 needs_fold_left_reduction_p (tree type, tree_code code,
2768 bool need_wrapping_integral_overflow)
2770 /* CHECKME: check for !flag_finite_math_only too? */
2771 if (SCALAR_FLOAT_TYPE_P (type))
2772 switch (code)
2774 case MIN_EXPR:
2775 case MAX_EXPR:
2776 return false;
2778 default:
2779 return !flag_associative_math;
2782 if (INTEGRAL_TYPE_P (type))
2784 if (!operation_no_trapping_overflow (type, code))
2785 return true;
2786 if (need_wrapping_integral_overflow
2787 && !TYPE_OVERFLOW_WRAPS (type)
2788 && operation_can_overflow (code))
2789 return true;
2790 return false;
2793 if (SAT_FIXED_POINT_TYPE_P (type))
2794 return true;
2796 return false;
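/* Illustrative example (not from this file) of why the check above
   forces an in-order (fold-left) reduction for floating point unless
   -fassociative-math is given: reassociating a float sum can change
   the rounded result, so the accumulation order must be preserved.  */
static float
example_in_order_sum (const float *a, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    s += a[i];          /* Must be accumulated strictly left to right.  */
  return s;
}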
2799 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2800 reduction operation CODE has a handled computation expression. */
2802 bool
2803 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2804 tree loop_arg, enum tree_code code)
2806 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2807 auto_bitmap visited;
2808 tree lookfor = PHI_RESULT (phi);
2809 ssa_op_iter curri;
2810 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2811 while (USE_FROM_PTR (curr) != loop_arg)
2812 curr = op_iter_next_use (&curri);
2813 curri.i = curri.numops;
2816 path.safe_push (std::make_pair (curri, curr));
2817 tree use = USE_FROM_PTR (curr);
2818 if (use == lookfor)
2819 break;
2820 gimple *def = SSA_NAME_DEF_STMT (use);
2821 if (gimple_nop_p (def)
2822 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2824 pop:
2827 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2828 curri = x.first;
2829 curr = x.second;
2831 curr = op_iter_next_use (&curri);
2832 /* Skip already visited or non-SSA operands (from iterating
2833 over PHI args). */
2834 while (curr != NULL_USE_OPERAND_P
2835 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2836 || ! bitmap_set_bit (visited,
2837 SSA_NAME_VERSION
2838 (USE_FROM_PTR (curr)))));
2840 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2841 if (curr == NULL_USE_OPERAND_P)
2842 break;
2844 else
2846 if (gimple_code (def) == GIMPLE_PHI)
2847 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2848 else
2849 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2850 while (curr != NULL_USE_OPERAND_P
2851 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2852 || ! bitmap_set_bit (visited,
2853 SSA_NAME_VERSION
2854 (USE_FROM_PTR (curr)))))
2855 curr = op_iter_next_use (&curri);
2856 if (curr == NULL_USE_OPERAND_P)
2857 goto pop;
2860 while (1);
2861 if (dump_file && (dump_flags & TDF_DETAILS))
2863 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2864 unsigned i;
2865 std::pair<ssa_op_iter, use_operand_p> *x;
2866 FOR_EACH_VEC_ELT (path, i, x)
2867 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2868 dump_printf (MSG_NOTE, "\n");
2871 /* Check whether the reduction path detected is valid. */
2872 bool fail = path.length () == 0;
2873 bool neg = false;
2874 for (unsigned i = 1; i < path.length (); ++i)
2876 gimple *use_stmt = USE_STMT (path[i].second);
2877 tree op = USE_FROM_PTR (path[i].second);
2878 if (! has_single_use (op)
2879 || ! is_gimple_assign (use_stmt))
2881 fail = true;
2882 break;
2884 if (gimple_assign_rhs_code (use_stmt) != code)
2886 if (code == PLUS_EXPR
2887 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2889 /* Track whether we negate the reduction value each iteration. */
2890 if (gimple_assign_rhs2 (use_stmt) == op)
2891 neg = ! neg;
2893 else
2895 fail = true;
2896 break;
2900 return ! fail && ! neg;
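/* Illustrative example (not from this file) of the 'neg' rejection
   above: here the accumulator is the subtrahend, so its sign flips on
   every iteration and the loop is not a simple PLUS reduction.  */
static int
example_alternating_accumulator (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s = a[i] - s;       /* s is negated each iteration.  */
  return s;
}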
2904 /* Function vect_is_simple_reduction
2906 (1) Detect a cross-iteration def-use cycle that represents a simple
2907 reduction computation. We look for the following pattern:
2909 loop_header:
2910 a1 = phi < a0, a2 >
2911 a3 = ...
2912 a2 = operation (a3, a1)
2916 a3 = ...
2917 loop_header:
2918 a1 = phi < a0, a2 >
2919 a2 = operation (a3, a1)
2921 such that:
2922 1. operation is commutative and associative and it is safe to
2923 change the order of the computation
2924 2. no uses for a2 in the loop (a2 is used out of the loop)
2925 3. no uses of a1 in the loop besides the reduction operation
2926 4. no uses of a1 outside the loop.
2928 Conditions 1,4 are tested here.
2929 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2931 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2932 nested cycles.
2934 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2935 reductions:
2937 a1 = phi < a0, a2 >
2938 inner loop (def of a3)
2939 a2 = phi < a3 >
2941 (4) Detect condition expressions, i.e.:
2942 for (int i = 0; i < N; i++)
2943 if (a[i] < val)
2944 ret_val = a[i];
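/* A hypothetical source loop (illustrative only) matching pattern (1)
   above: a1 is the phi of the accumulator, a3 the loaded value and a2
   the updated accumulator carried to the next iteration.  */
static int
example_simple_reduction (const int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum = sum + a[i];
  return sum;
}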
2948 static stmt_vec_info
2949 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2950 bool *double_reduc,
2951 bool need_wrapping_integral_overflow,
2952 enum vect_reduction_type *v_reduc_type)
2954 gphi *phi = as_a <gphi *> (phi_info->stmt);
2955 struct loop *loop = (gimple_bb (phi))->loop_father;
2956 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2957 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2958 gimple *phi_use_stmt = NULL;
2959 enum tree_code orig_code, code;
2960 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2961 tree type;
2962 tree name;
2963 imm_use_iterator imm_iter;
2964 use_operand_p use_p;
2965 bool phi_def;
2967 *double_reduc = false;
2968 *v_reduc_type = TREE_CODE_REDUCTION;
2970 tree phi_name = PHI_RESULT (phi);
2971 /* ??? If there are no uses of the PHI result the inner loop reduction
2972 won't be detected as possibly double-reduction by vectorizable_reduction
2973 because that tries to walk the PHI arg from the preheader edge which
2974 can be constant. See PR60382. */
2975 if (has_zero_uses (phi_name))
2976 return NULL;
2977 unsigned nphi_def_loop_uses = 0;
2978 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2980 gimple *use_stmt = USE_STMT (use_p);
2981 if (is_gimple_debug (use_stmt))
2982 continue;
2984 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2986 if (dump_enabled_p ())
2987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2988 "intermediate value used outside loop.\n");
2990 return NULL;
2993 nphi_def_loop_uses++;
2994 phi_use_stmt = use_stmt;
2997 edge latch_e = loop_latch_edge (loop);
2998 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2999 if (TREE_CODE (loop_arg) != SSA_NAME)
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3003 "reduction: not ssa_name: %T\n", loop_arg);
3004 return NULL;
3007 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
3008 if (!def_stmt_info
3009 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3010 return NULL;
3012 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
3014 name = gimple_assign_lhs (def_stmt);
3015 phi_def = false;
3017 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3019 name = PHI_RESULT (def_stmt);
3020 phi_def = true;
3022 else
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3026 "reduction: unhandled reduction operation: %G",
3027 def_stmt_info->stmt);
3028 return NULL;
3031 unsigned nlatch_def_loop_uses = 0;
3032 auto_vec<gphi *, 3> lcphis;
3033 bool inner_loop_of_double_reduc = false;
3034 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3036 gimple *use_stmt = USE_STMT (use_p);
3037 if (is_gimple_debug (use_stmt))
3038 continue;
3039 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3040 nlatch_def_loop_uses++;
3041 else
3043 /* We can have more than one loop-closed PHI. */
3044 lcphis.safe_push (as_a <gphi *> (use_stmt));
3045 if (nested_in_vect_loop
3046 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3047 == vect_double_reduction_def))
3048 inner_loop_of_double_reduc = true;
3052 /* If this isn't a nested cycle or if the nested cycle reduction value
3053 is used outside of the inner loop, we cannot handle uses of the reduction
3054 value. */
3055 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3056 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3060 "reduction used in loop.\n");
3061 return NULL;
3064 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3065 defined in the inner loop. */
3066 if (phi_def)
3068 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
3069 op1 = PHI_ARG_DEF (def_stmt, 0);
3071 if (gimple_phi_num_args (def_stmt) != 1
3072 || TREE_CODE (op1) != SSA_NAME)
3074 if (dump_enabled_p ())
3075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3076 "unsupported phi node definition.\n");
3078 return NULL;
3081 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3082 if (gimple_bb (def1)
3083 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3084 && loop->inner
3085 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3086 && is_gimple_assign (def1)
3087 && is_a <gphi *> (phi_use_stmt)
3088 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3090 if (dump_enabled_p ())
3091 report_vect_op (MSG_NOTE, def_stmt,
3092 "detected double reduction: ");
3094 *double_reduc = true;
3095 return def_stmt_info;
3098 return NULL;
3101 /* If we are vectorizing an inner reduction, we execute it in the
3102 original order only when we are not dealing with a
3103 double reduction. */
3104 bool check_reduction = true;
3105 if (flow_loop_nested_p (vect_loop, loop))
3107 gphi *lcphi;
3108 unsigned i;
3109 check_reduction = false;
3110 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3111 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3113 gimple *use_stmt = USE_STMT (use_p);
3114 if (is_gimple_debug (use_stmt))
3115 continue;
3116 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3117 check_reduction = true;
3121 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3122 code = orig_code = gimple_assign_rhs_code (def_stmt);
3124 if (nested_in_vect_loop && !check_reduction)
3126 /* FIXME: Even for non-reductions code generation is funneled
3127 through vectorizable_reduction for the stmt defining the
3128 PHI latch value. So we have to artificially restrict ourselves
3129 to the supported operations. */
3130 switch (get_gimple_rhs_class (code))
3132 case GIMPLE_BINARY_RHS:
3133 case GIMPLE_TERNARY_RHS:
3134 break;
3135 default:
3136 /* Not supported by vectorizable_reduction. */
3137 if (dump_enabled_p ())
3138 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3139 "nested cycle: not handled operation: ");
3140 return NULL;
3142 if (dump_enabled_p ())
3143 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3144 return def_stmt_info;
3147 /* We can handle "res -= x[i]", which is non-associative, by
3148 simply rewriting this into "res += -x[i]". Avoid changing the
3149 gimple instruction for the first simple tests and only do this
3150 if we're allowed to change code at all. */
3151 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3152 code = PLUS_EXPR;
3154 if (code == COND_EXPR)
3156 if (! nested_in_vect_loop)
3157 *v_reduc_type = COND_REDUCTION;
3159 op3 = gimple_assign_rhs1 (def_stmt);
3160 if (COMPARISON_CLASS_P (op3))
3162 op4 = TREE_OPERAND (op3, 1);
3163 op3 = TREE_OPERAND (op3, 0);
3165 if (op3 == phi_name || op4 == phi_name)
3167 if (dump_enabled_p ())
3168 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3169 "reduction: condition depends on previous"
3170 " iteration: ");
3171 return NULL;
3174 op1 = gimple_assign_rhs2 (def_stmt);
3175 op2 = gimple_assign_rhs3 (def_stmt);
3177 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3179 if (dump_enabled_p ())
3180 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3181 "reduction: not commutative/associative: ");
3182 return NULL;
3184 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3186 op1 = gimple_assign_rhs1 (def_stmt);
3187 op2 = gimple_assign_rhs2 (def_stmt);
3189 else
3191 if (dump_enabled_p ())
3192 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3193 "reduction: not handled operation: ");
3194 return NULL;
3197 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3199 if (dump_enabled_p ())
3200 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3201 "reduction: both uses not ssa_names: ");
3203 return NULL;
3206 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3207 if ((TREE_CODE (op1) == SSA_NAME
3208 && !types_compatible_p (type,TREE_TYPE (op1)))
3209 || (TREE_CODE (op2) == SSA_NAME
3210 && !types_compatible_p (type, TREE_TYPE (op2)))
3211 || (op3 && TREE_CODE (op3) == SSA_NAME
3212 && !types_compatible_p (type, TREE_TYPE (op3)))
3213 || (op4 && TREE_CODE (op4) == SSA_NAME
3214 && !types_compatible_p (type, TREE_TYPE (op4))))
3216 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE, vect_location,
3219 "reduction: multiple types: operation type: "
3220 "%T, operands types: %T,%T",
3221 type, TREE_TYPE (op1), TREE_TYPE (op2));
3222 if (op3)
3223 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3225 if (op4)
3226 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3227 dump_printf (MSG_NOTE, "\n");
3230 return NULL;
3233 /* Check whether it's ok to change the order of the computation.
3234 Generally, when vectorizing a reduction we change the order of the
3235 computation. This may change the behavior of the program in some
3236 cases, so we need to check that this is ok. One exception is when
3237 vectorizing an outer-loop: the inner-loop is executed sequentially,
3238 and therefore vectorizing reductions in the inner-loop during
3239 outer-loop vectorization is safe. */
3240 if (check_reduction
3241 && *v_reduc_type == TREE_CODE_REDUCTION
3242 && needs_fold_left_reduction_p (type, code,
3243 need_wrapping_integral_overflow))
3244 *v_reduc_type = FOLD_LEFT_REDUCTION;
3246 /* Reduction is safe. We're dealing with one of the following:
3247 1) integer arithmetic and no trapv
3248 2) floating point arithmetic, and special flags permit this optimization
3249 3) nested cycle (i.e., outer loop vectorization). */
3250 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3251 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3252 if (code != COND_EXPR && !def1_info && !def2_info)
3254 if (dump_enabled_p ())
3255 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3256 return NULL;
3259 /* Check that one def is the reduction def, defined by PHI,
3260 the other def is either defined in the loop ("vect_internal_def"),
3261 or it's an induction (defined by a loop-header phi-node). */
3263 if (def2_info
3264 && def2_info->stmt == phi
3265 && (code == COND_EXPR
3266 || !def1_info
3267 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3268 || vect_valid_reduction_input_p (def1_info)))
3270 if (dump_enabled_p ())
3271 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3272 return def_stmt_info;
3275 if (def1_info
3276 && def1_info->stmt == phi
3277 && (code == COND_EXPR
3278 || !def2_info
3279 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3280 || vect_valid_reduction_input_p (def2_info)))
3282 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3284 /* Check if we can swap operands (just for simplicity - so that
3285 the rest of the code can assume that the reduction variable
3286 is always the last (second) argument). */
3287 if (code == COND_EXPR)
3289 /* Swap cond_expr by inverting the condition. */
3290 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3291 enum tree_code invert_code = ERROR_MARK;
3292 enum tree_code cond_code = TREE_CODE (cond_expr);
3294 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3296 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3297 invert_code = invert_tree_comparison (cond_code, honor_nans);
3299 if (invert_code != ERROR_MARK)
3301 TREE_SET_CODE (cond_expr, invert_code);
3302 swap_ssa_operands (def_stmt,
3303 gimple_assign_rhs2_ptr (def_stmt),
3304 gimple_assign_rhs3_ptr (def_stmt));
3306 else
3308 if (dump_enabled_p ())
3309 report_vect_op (MSG_NOTE, def_stmt,
3310 "detected reduction: cannot swap operands "
3311 "for cond_expr");
3312 return NULL;
3315 else
3316 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3317 gimple_assign_rhs2_ptr (def_stmt));
3319 if (dump_enabled_p ())
3320 report_vect_op (MSG_NOTE, def_stmt,
3321 "detected reduction: need to swap operands: ");
3323 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3324 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3326 else
3328 if (dump_enabled_p ())
3329 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3332 return def_stmt_info;
3335 /* Try to find SLP reduction chain. */
3336 if (! nested_in_vect_loop
3337 && code != COND_EXPR
3338 && orig_code != MINUS_EXPR
3339 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3341 if (dump_enabled_p ())
3342 report_vect_op (MSG_NOTE, def_stmt,
3343 "reduction: detected reduction chain: ");
3345 return def_stmt_info;
3348 /* Look for the expression computing loop_arg from loop PHI result. */
3349 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3350 return def_stmt_info;
3352 if (dump_enabled_p ())
3354 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3355 "reduction: unknown pattern: ");
3358 return NULL;
3361 /* Wrapper around vect_is_simple_reduction, which will modify code
3362 in-place if it enables detection of more reductions. Arguments
3363 as there. */
3365 stmt_vec_info
3366 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3367 bool *double_reduc,
3368 bool need_wrapping_integral_overflow)
3370 enum vect_reduction_type v_reduc_type;
3371 stmt_vec_info def_info
3372 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3373 need_wrapping_integral_overflow,
3374 &v_reduc_type);
3375 if (def_info)
3377 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3378 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3379 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3380 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3382 return def_info;
3385 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3386 int
3387 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3388 int *peel_iters_epilogue,
3389 stmt_vector_for_cost *scalar_cost_vec,
3390 stmt_vector_for_cost *prologue_cost_vec,
3391 stmt_vector_for_cost *epilogue_cost_vec)
3393 int retval = 0;
3394 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3396 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3398 *peel_iters_epilogue = assumed_vf / 2;
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location,
3401 "cost model: epilogue peel iters set to vf/2 "
3402 "because loop iterations are unknown .\n");
3404 /* If peeled iterations are known but the number of scalar loop
3405 iterations is unknown, count a taken branch per peeled loop. */
3406 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3407 NULL, 0, vect_prologue);
3408 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3409 NULL, 0, vect_epilogue);
3411 else
3413 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3414 peel_iters_prologue = niters < peel_iters_prologue ?
3415 niters : peel_iters_prologue;
3416 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3417 /* If we need to peel for gaps but the computed epilogue peel count is
3418 zero, we still have to peel VF iterations for the epilogue. */
3419 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3420 *peel_iters_epilogue = assumed_vf;
3423 stmt_info_for_cost *si;
3424 int j;
3425 if (peel_iters_prologue)
3426 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3427 retval += record_stmt_cost (prologue_cost_vec,
3428 si->count * peel_iters_prologue,
3429 si->kind, si->stmt_info, si->misalign,
3430 vect_prologue);
3431 if (*peel_iters_epilogue)
3432 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3433 retval += record_stmt_cost (epilogue_cost_vec,
3434 si->count * *peel_iters_epilogue,
3435 si->kind, si->stmt_info, si->misalign,
3436 vect_epilogue);
3438 return retval;
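/* Worked sketch (illustrative only, ignoring the cost-vector
   bookkeeping) of the peeling arithmetic above: with known niters the
   epilogue peel count is the remainder left after the prologue.  */
static int
example_peel_iters_epilogue (int niters, int peel_iters_prologue,
                             int assumed_vf, int peeling_for_gaps)
{
  if (peel_iters_prologue > niters)
    peel_iters_prologue = niters;
  int epilogue = (niters - peel_iters_prologue) % assumed_vf;
  /* E.g. niters = 100, prologue = 3, vf = 8 gives (100 - 3) % 8 = 1.  */
  if (peeling_for_gaps && epilogue == 0)
    epilogue = assumed_vf;
  return epilogue;
}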
3441 /* Function vect_estimate_min_profitable_iters
3443 Return the number of iterations required for the vector version of the
3444 loop to be profitable relative to the cost of the scalar version of the
3445 loop.
3447 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3448 of iterations for vectorization. A value of -1 means loop vectorization
3449 is not profitable. This returned value may be used for a dynamic
3450 profitability check.
3452 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3453 for a static check against the estimated number of iterations. */
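/* Rough sketch (illustrative only; the real computation below uses the
   target cost hooks and also accounts for peeling and versioning) of
   the break-even condition this function evaluates: vectorization pays
   off once niters * scalar_iter_cost exceeds
   vec_outside_cost + (niters / vf) * vec_inside_cost.  */
static int
example_min_profitable_iters (int scalar_iter_cost, int vec_inside_cost,
                              int vec_outside_cost, int vf)
{
  /* Saving obtained by replacing VF scalar iterations with one vector
     iteration.  */
  int saving = scalar_iter_cost * vf - vec_inside_cost;
  if (saving <= 0)
    return -1;                  /* Never profitable.  */
  /* Vector iterations needed to amortize the one-off outside costs,
     rounded up, converted back to scalar iterations.  */
  int nvect = (vec_outside_cost + saving - 1) / saving;
  return nvect * vf;
}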
3455 static void
3456 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3457 int *ret_min_profitable_niters,
3458 int *ret_min_profitable_estimate)
3460 int min_profitable_iters;
3461 int min_profitable_estimate;
3462 int peel_iters_prologue;
3463 int peel_iters_epilogue;
3464 unsigned vec_inside_cost = 0;
3465 int vec_outside_cost = 0;
3466 unsigned vec_prologue_cost = 0;
3467 unsigned vec_epilogue_cost = 0;
3468 int scalar_single_iter_cost = 0;
3469 int scalar_outside_cost = 0;
3470 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3471 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3472 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3474 /* Cost model disabled. */
3475 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3479 *ret_min_profitable_niters = 0;
3480 *ret_min_profitable_estimate = 0;
3481 return;
3484 /* Requires loop versioning tests to handle misalignment. */
3485 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3487 /* FIXME: Make cost depend on complexity of individual check. */
3488 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3489 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3490 vect_prologue);
3491 if (dump_enabled_p ())
3492 dump_printf (MSG_NOTE,
3493 "cost model: Adding cost of checks for loop "
3494 "versioning to treat misalignment.\n");
3497 /* Requires loop versioning with alias checks. */
3498 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3500 /* FIXME: Make cost depend on complexity of individual check. */
3501 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3502 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3503 vect_prologue);
3504 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3505 if (len)
3506 /* Count LEN - 1 ANDs and LEN comparisons. */
3507 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3508 NULL, 0, vect_prologue);
3509 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3510 if (len)
3512 /* Count LEN - 1 ANDs and LEN comparisons. */
3513 unsigned int nstmts = len * 2 - 1;
3514 /* +1 for each bias that needs adding. */
3515 for (unsigned int i = 0; i < len; ++i)
3516 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3517 nstmts += 1;
3518 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3519 NULL, 0, vect_prologue);
3521 if (dump_enabled_p ())
3522 dump_printf (MSG_NOTE,
3523 "cost model: Adding cost of checks for loop "
3524 "versioning aliasing.\n");
3527 /* Requires loop versioning with niter checks. */
3528 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3530 /* FIXME: Make cost depend on complexity of individual check. */
3531 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3532 vect_prologue);
3533 if (dump_enabled_p ())
3534 dump_printf (MSG_NOTE,
3535 "cost model: Adding cost of checks for loop "
3536 "versioning niters.\n");
3539 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3540 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3541 vect_prologue);
3543 /* Count statements in scalar loop. Using this as scalar cost for a single
3544 iteration for now.
3546 TODO: Add outer loop support.
3548 TODO: Consider assigning different costs to different scalar
3549 statements. */
3551 scalar_single_iter_cost
3552 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3554 /* Add additional cost for the peeled instructions in prologue and epilogue
3555 loop. (For fully-masked loops there will be no peeling.)
3557 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3558 at compile time, we assume it's vf/2 (the worst would be vf-1).
3560 TODO: Build an expression that represents peel_iters for prologue and
3561 epilogue to be used in a run-time test. */
3563 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3565 peel_iters_prologue = 0;
3566 peel_iters_epilogue = 0;
3568 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3570 /* We need to peel exactly one iteration. */
3571 peel_iters_epilogue += 1;
3572 stmt_info_for_cost *si;
3573 int j;
3574 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3575 j, si)
3576 (void) add_stmt_cost (target_cost_data, si->count,
3577 si->kind, si->stmt_info, si->misalign,
3578 vect_epilogue);
3581 else if (npeel < 0)
3583 peel_iters_prologue = assumed_vf / 2;
3584 if (dump_enabled_p ())
3585 dump_printf (MSG_NOTE, "cost model: "
3586 "prologue peel iters set to vf/2.\n");
3588 /* If peeling for alignment is unknown, the loop bound of the main loop
3589 becomes unknown. */
3590 peel_iters_epilogue = assumed_vf / 2;
3591 if (dump_enabled_p ())
3592 dump_printf (MSG_NOTE, "cost model: "
3593 "epilogue peel iters set to vf/2 because "
3594 "peeling for alignment is unknown.\n");
3596 /* If peeled iterations are unknown, count a taken branch and a not taken
3597 branch per peeled loop. Even if scalar loop iterations are known,
3598 vector iterations are not known since peeled prologue iterations are
3599 not known. Hence guards remain the same. */
3600 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3601 NULL, 0, vect_prologue);
3602 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3603 NULL, 0, vect_prologue);
3604 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3605 NULL, 0, vect_epilogue);
3606 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3607 NULL, 0, vect_epilogue);
3608 stmt_info_for_cost *si;
3609 int j;
3610 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3612 (void) add_stmt_cost (target_cost_data,
3613 si->count * peel_iters_prologue,
3614 si->kind, si->stmt_info, si->misalign,
3615 vect_prologue);
3616 (void) add_stmt_cost (target_cost_data,
3617 si->count * peel_iters_epilogue,
3618 si->kind, si->stmt_info, si->misalign,
3619 vect_epilogue);
3622 else
3624 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3625 stmt_info_for_cost *si;
3626 int j;
3627 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3629 prologue_cost_vec.create (2);
3630 epilogue_cost_vec.create (2);
3631 peel_iters_prologue = npeel;
3633 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3634 &peel_iters_epilogue,
3635 &LOOP_VINFO_SCALAR_ITERATION_COST
3636 (loop_vinfo),
3637 &prologue_cost_vec,
3638 &epilogue_cost_vec);
3640 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3641 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3642 si->misalign, vect_prologue);
3644 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3645 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3646 si->misalign, vect_epilogue);
3648 prologue_cost_vec.release ();
3649 epilogue_cost_vec.release ();
3652 /* FORNOW: The scalar outside cost is incremented in one of the
3653 following ways:
3655 1. The vectorizer checks for alignment and aliasing and generates
3656 a condition that allows dynamic vectorization. A cost model
3657 check is ANDED with the versioning condition. Hence scalar code
3658 path now has the added cost of the versioning check.
3660 if (cost > th & versioning_check)
3661 jmp to vector code
3663 Hence run-time scalar is incremented by not-taken branch cost.
3665 2. The vectorizer then checks if a prologue is required. If the
3666 cost model check was not done before during versioning, it has to
3667 be done before the prologue check.
3669 if (cost <= th)
3670 prologue = scalar_iters
3671 if (prologue == 0)
3672 jmp to vector code
3673 else
3674 execute prologue
3675 if (prologue == num_iters)
3676 go to exit
3678 Hence the run-time scalar cost is incremented by a taken branch,
3679 plus a not-taken branch, plus a taken branch cost.
3681 3. The vectorizer then checks if an epilogue is required. If the
3682 cost model check was not done before during prologue check, it
3683 has to be done with the epilogue check.
3685 if (prologue == 0)
3686 jmp to vector code
3687 else
3688 execute prologue
3689 if (prologue == num_iters)
3690 go to exit
3691 vector code:
3692 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3693 jmp to epilogue
3695 Hence the run-time scalar cost should be incremented by 2 taken
3696 branches.
3698 TODO: The back end may reorder the BBs differently and reverse
3699 conditions/branch directions. Change the estimates below to
3700 something more reasonable. */
3702 /* If the number of iterations is known and we do not do versioning, we can
3703 decide whether to vectorize at compile time. Hence the scalar version
3704 does not carry cost model guard costs. */
3705 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3706 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3708 /* Cost model check occurs at versioning. */
3709 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3710 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3711 else
3713 /* Cost model check occurs at prologue generation. */
3714 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3715 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3716 + vect_get_stmt_cost (cond_branch_not_taken);
3717 /* Cost model check occurs at epilogue generation. */
3718 else
3719 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3723 /* Complete the target-specific cost calculations. */
3724 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3725 &vec_inside_cost, &vec_epilogue_cost);
3727 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3729 if (dump_enabled_p ())
3731 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3732 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3733 vec_inside_cost);
3734 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3735 vec_prologue_cost);
3736 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3737 vec_epilogue_cost);
3738 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3739 scalar_single_iter_cost);
3740 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3741 scalar_outside_cost);
3742 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3743 vec_outside_cost);
3744 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3745 peel_iters_prologue);
3746 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3747 peel_iters_epilogue);
3750 /* Calculate number of iterations required to make the vector version
3751 profitable, relative to the loop bodies only. The following condition
3752 must hold true:
3753 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3754 where
3755 SIC = scalar iteration cost, VIC = vector iteration cost,
3756 VOC = vector outside cost, VF = vectorization factor,
3757 NPEEL = prologue iterations + epilogue iterations,
3758 SOC = scalar outside cost for run time cost model check. */
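  /* As a purely hypothetical illustration (the numbers below are not taken
     from any target cost model): with SIC = 4, VIC = 6, VF = 4, NPEEL = 2,
     SOC = 10 and VOC = 30 the condition becomes
       4 * niters + 10 > 1.5 * niters + 27
     (ignoring the floor in (niters - NPEEL) / VF), which first holds at
     niters = 7.  */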
3760 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3761 - vec_inside_cost);
3762 if (saving_per_viter <= 0)
3764 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3765 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3766 "vectorization did not happen for a simd loop");
3768 if (dump_enabled_p ())
3769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3770 "cost model: the vector iteration cost = %d "
3771 "divided by the scalar iteration cost = %d "
3772 "is greater or equal to the vectorization factor = %d"
3773 ".\n",
3774 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3775 *ret_min_profitable_niters = -1;
3776 *ret_min_profitable_estimate = -1;
3777 return;
3780 /* ??? The "if" arm is written to handle all cases; see below for what
3781 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3782 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3784 /* Rewriting the condition above in terms of the number of
3785 vector iterations (vniters) rather than the number of
3786 scalar iterations (niters) gives:
3788 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3790 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3792 For integer N, X and Y when X > 0:
3794 N * X > Y <==> N >= (Y /[floor] X) + 1. */
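  /* Continuing the hypothetical numbers used in the illustration above
     (SIC = 4, VIC = 6, VF = 4, NPEEL = 2, SOC = 10, VOC = 30):
     X = SIC * VF - VIC = 10 and Y = VOC - SIC * NPEEL - SOC = 12,
     so the smallest qualifying vniters is 12 / 10 + 1 = 2.  */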
3795 int outside_overhead = (vec_outside_cost
3796 - scalar_single_iter_cost * peel_iters_prologue
3797 - scalar_single_iter_cost * peel_iters_epilogue
3798 - scalar_outside_cost);
3799 /* We're only interested in cases that require at least one
3800 vector iteration. */
3801 int min_vec_niters = 1;
3802 if (outside_overhead > 0)
3803 min_vec_niters = outside_overhead / saving_per_viter + 1;
3805 if (dump_enabled_p ())
3806 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3807 min_vec_niters);
3809 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3811 /* Now that we know the minimum number of vector iterations,
3812 find the minimum niters for which the scalar cost is larger:
3814 SIC * niters > VIC * vniters + VOC - SOC
3816 We know that the minimum niters is no more than
3817 vniters * VF + NPEEL, but it might be (and often is) less
3818 than that if a partial vector iteration is cheaper than the
3819 equivalent scalar code. */
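      /* With the hypothetical numbers above and min_vec_niters = 2 this gives
	 a threshold of 6 * 2 + 30 - 10 = 32 and hence
	 min_profitable_iters = 32 / 4 + 1 = 9.  */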
3820 int threshold = (vec_inside_cost * min_vec_niters
3821 + vec_outside_cost
3822 - scalar_outside_cost);
3823 if (threshold <= 0)
3824 min_profitable_iters = 1;
3825 else
3826 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3828 else
3829 /* Convert the number of vector iterations into a number of
3830 scalar iterations. */
3831 min_profitable_iters = (min_vec_niters * assumed_vf
3832 + peel_iters_prologue
3833 + peel_iters_epilogue);
3835 else
3837 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3838 * assumed_vf
3839 - vec_inside_cost * peel_iters_prologue
3840 - vec_inside_cost * peel_iters_epilogue);
3841 if (min_profitable_iters <= 0)
3842 min_profitable_iters = 0;
3843 else
3845 min_profitable_iters /= saving_per_viter;
3847 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3848 <= (((int) vec_inside_cost * min_profitable_iters)
3849 + (((int) vec_outside_cost - scalar_outside_cost)
3850 * assumed_vf)))
3851 min_profitable_iters++;
3855 if (dump_enabled_p ())
3856 dump_printf (MSG_NOTE,
3857 " Calculated minimum iters for profitability: %d\n",
3858 min_profitable_iters);
3860 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3861 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3862 /* We want the vectorized loop to execute at least once. */
3863 min_profitable_iters = assumed_vf + peel_iters_prologue;
3865 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_NOTE, vect_location,
3867 " Runtime profitability threshold = %d\n",
3868 min_profitable_iters);
3870 *ret_min_profitable_niters = min_profitable_iters;
3872 /* Calculate number of iterations required to make the vector version
3873 profitable, relative to the loop bodies only.
3875 The non-vectorized variant costs SIC * niters and must win over the vector
3876 variant for the expected loop trip count. The following condition must hold true:
3877 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
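  /* With the same hypothetical numbers as before, this estimate condition is
     4 * niters > 1.5 * niters + 37 (ignoring the floor), which first holds
     at niters = 15.  */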
3879 if (vec_outside_cost <= 0)
3880 min_profitable_estimate = 0;
3881 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3883 /* This is a repeat of the code above, but with + SOC rather
3884 than - SOC. */
3885 int outside_overhead = (vec_outside_cost
3886 - scalar_single_iter_cost * peel_iters_prologue
3887 - scalar_single_iter_cost * peel_iters_epilogue
3888 + scalar_outside_cost);
3889 int min_vec_niters = 1;
3890 if (outside_overhead > 0)
3891 min_vec_niters = outside_overhead / saving_per_viter + 1;
3893 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3895 int threshold = (vec_inside_cost * min_vec_niters
3896 + vec_outside_cost
3897 + scalar_outside_cost);
3898 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3900 else
3901 min_profitable_estimate = (min_vec_niters * assumed_vf
3902 + peel_iters_prologue
3903 + peel_iters_epilogue);
3905 else
3907 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3908 * assumed_vf
3909 - vec_inside_cost * peel_iters_prologue
3910 - vec_inside_cost * peel_iters_epilogue)
3911 / ((scalar_single_iter_cost * assumed_vf)
3912 - vec_inside_cost);
3914 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3915 if (dump_enabled_p ())
3916 dump_printf_loc (MSG_NOTE, vect_location,
3917 " Static estimate profitability threshold = %d\n",
3918 min_profitable_estimate);
3920 *ret_min_profitable_estimate = min_profitable_estimate;
3923 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3924 vector elements (not bits) for a vector with NELT elements. */
3925 static void
3926 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3927 vec_perm_builder *sel)
3929 /* The encoding is a single stepped pattern. Any wrap-around is handled
3930 by vec_perm_indices. */
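  /* For example, for OFFSET == 2 and NELT == 8 the encoded elements are
     { 2, 3, 4 }, which vec_perm_indices extends to { 2, 3, 4, 5, 6, 7, 8, 9 };
     indices 8 and 9 select elements 0 and 1 of the second vec_perm operand.  */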
3931 sel->new_vector (nelt, 1, 3);
3932 for (unsigned int i = 0; i < 3; i++)
3933 sel->quick_push (i + offset);
3936 /* Checks whether the target supports whole-vector shifts for vectors of mode
3937 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3938 it supports vec_perm_const with masks for all necessary shift amounts. */
3939 static bool
3940 have_whole_vector_shift (machine_mode mode)
3942 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3943 return true;
3945 /* Variable-length vectors should be handled via the optab. */
3946 unsigned int nelt;
3947 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3948 return false;
3950 vec_perm_builder sel;
3951 vec_perm_indices indices;
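  /* E.g. for NELT == 8 the loop below checks shifts by 4, 2 and 1 elements,
     which are the shift amounts a log2-style epilogue reduction needs.  */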
3952 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3954 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3955 indices.new_vector (sel, 2, nelt);
3956 if (!can_vec_perm_const_p (mode, indices, false))
3957 return false;
3959 return true;
3962 /* TODO: There is a close dependency between the vect_model_*_cost and
3963 vectorizable_* functions. Design better to avoid maintenance issues. */
3965 /* Function vect_model_reduction_cost.
3967 Models cost for a reduction operation, including the vector ops
3968 generated within the strip-mine loop, the initial definition before
3969 the loop, and the epilogue code that must be generated. */
3971 static void
3972 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3973 int ncopies, stmt_vector_for_cost *cost_vec)
3975 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3976 enum tree_code code;
3977 optab optab;
3978 tree vectype;
3979 machine_mode mode;
3980 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3981 struct loop *loop = NULL;
3983 if (loop_vinfo)
3984 loop = LOOP_VINFO_LOOP (loop_vinfo);
3986 /* Condition reductions generate two reductions in the loop. */
3987 vect_reduction_type reduction_type
3988 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3989 if (reduction_type == COND_REDUCTION)
3990 ncopies *= 2;
3992 vectype = STMT_VINFO_VECTYPE (stmt_info);
3993 mode = TYPE_MODE (vectype);
3994 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3996 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3998 if (reduction_type == EXTRACT_LAST_REDUCTION
3999 || reduction_type == FOLD_LEFT_REDUCTION)
4001 /* No extra instructions needed in the prologue. */
4002 prologue_cost = 0;
4004 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4005 /* Count one reduction-like operation per vector. */
4006 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4007 stmt_info, 0, vect_body);
4008 else
4010 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4011 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4012 inside_cost = record_stmt_cost (cost_vec, nelements,
4013 vec_to_scalar, stmt_info, 0,
4014 vect_body);
4015 inside_cost += record_stmt_cost (cost_vec, nelements,
4016 scalar_stmt, stmt_info, 0,
4017 vect_body);
4020 else
4022 /* Add in cost for initial definition.
4023 For cond reduction we have four vectors: initial index, step,
4024 initial result of the data reduction, initial value of the index
4025 reduction. */
4026 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4027 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4028 scalar_to_vec, stmt_info, 0,
4029 vect_prologue);
4031 /* Cost of reduction op inside loop. */
4032 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4033 stmt_info, 0, vect_body);
4036 /* Determine cost of epilogue code.
4038 We have a reduction operator that will reduce the vector in one statement.
4039 Also requires scalar extract. */
4041 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4043 if (reduc_fn != IFN_LAST)
4045 if (reduction_type == COND_REDUCTION)
4047 /* An EQ stmt and a COND_EXPR stmt. */
4048 epilogue_cost += record_stmt_cost (cost_vec, 2,
4049 vector_stmt, stmt_info, 0,
4050 vect_epilogue);
4051 /* Reduction of the max index and a reduction of the found
4052 values. */
4053 epilogue_cost += record_stmt_cost (cost_vec, 2,
4054 vec_to_scalar, stmt_info, 0,
4055 vect_epilogue);
4056 /* A broadcast of the max value. */
4057 epilogue_cost += record_stmt_cost (cost_vec, 1,
4058 scalar_to_vec, stmt_info, 0,
4059 vect_epilogue);
4061 else
4063 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4064 stmt_info, 0, vect_epilogue);
4065 epilogue_cost += record_stmt_cost (cost_vec, 1,
4066 vec_to_scalar, stmt_info, 0,
4067 vect_epilogue);
4070 else if (reduction_type == COND_REDUCTION)
4072 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4073 /* Extraction of scalar elements. */
4074 epilogue_cost += record_stmt_cost (cost_vec,
4075 2 * estimated_nunits,
4076 vec_to_scalar, stmt_info, 0,
4077 vect_epilogue);
4078 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4079 epilogue_cost += record_stmt_cost (cost_vec,
4080 2 * estimated_nunits - 3,
4081 scalar_stmt, stmt_info, 0,
4082 vect_epilogue);
4084 else if (reduction_type == EXTRACT_LAST_REDUCTION
4085 || reduction_type == FOLD_LEFT_REDUCTION)
4086 /* No extra instructions are needed in the epilogue. */
4088 else
4090 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4091 tree bitsize =
4092 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4093 int element_bitsize = tree_to_uhwi (bitsize);
4094 int nelements = vec_size_in_bits / element_bitsize;
4096 if (code == COND_EXPR)
4097 code = MAX_EXPR;
4099 optab = optab_for_tree_code (code, vectype, optab_default);
4101 /* We have a whole vector shift available. */
4102 if (optab != unknown_optab
4103 && VECTOR_MODE_P (mode)
4104 && optab_handler (optab, mode) != CODE_FOR_nothing
4105 && have_whole_vector_shift (mode))
4107 /* Final reduction via vector shifts and the reduction operator.
4108 Also requires scalar extract. */
4109 epilogue_cost += record_stmt_cost (cost_vec,
4110 exact_log2 (nelements) * 2,
4111 vector_stmt, stmt_info, 0,
4112 vect_epilogue);
4113 epilogue_cost += record_stmt_cost (cost_vec, 1,
4114 vec_to_scalar, stmt_info, 0,
4115 vect_epilogue);
4117 else
4118 /* Use extracts and reduction op for final reduction. For N
4119 elements, we have N extracts and N-1 reduction ops. */
4120 epilogue_cost += record_stmt_cost (cost_vec,
4121 nelements + nelements - 1,
4122 vector_stmt, stmt_info, 0,
4123 vect_epilogue);
4127 if (dump_enabled_p ())
4128 dump_printf (MSG_NOTE,
4129 "vect_model_reduction_cost: inside_cost = %d, "
4130 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4131 prologue_cost, epilogue_cost);
4135 /* Function vect_model_induction_cost.
4137 Models cost for induction operations. */
4139 static void
4140 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4141 stmt_vector_for_cost *cost_vec)
4143 unsigned inside_cost, prologue_cost;
4145 if (PURE_SLP_STMT (stmt_info))
4146 return;
4148 /* loop cost for vec_loop. */
4149 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4150 stmt_info, 0, vect_body);
4152 /* prologue cost for vec_init and vec_step. */
4153 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4154 stmt_info, 0, vect_prologue);
4156 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_NOTE, vect_location,
4158 "vect_model_induction_cost: inside_cost = %d, "
4159 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4164 /* Function get_initial_def_for_reduction
4166 Input:
4167 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4168 INIT_VAL - the initial value of the reduction variable
4170 Output:
4171 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4172 of the reduction (used for adjusting the epilog - see below).
4173 Return a vector variable, initialized according to the operation that
4174 STMT_VINFO performs. This vector will be used as the initial value
4175 of the vector of partial results.
4177 Option1 (adjust in epilog): Initialize the vector as follows:
4178 add/bit or/xor: [0,0,...,0,0]
4179 mult/bit and: [1,1,...,1,1]
4180 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4181 and when necessary (e.g. add/mult case) let the caller know
4182 that it needs to adjust the result by init_val.
4184 Option2: Initialize the vector as follows:
4185 add/bit or/xor: [init_val,0,0,...,0]
4186 mult/bit and: [init_val,1,1,...,1]
4187 min/max/cond_expr: [init_val,init_val,...,init_val]
4188 and no adjustments are needed.
4190 For example, for the following code:
4192 s = init_val;
4193 for (i=0;i<n;i++)
4194 s = s + a[i];
4196 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4197 For a vector of 4 units, we want to return either [0,0,0,init_val],
4198 or [0,0,0,0] and let the caller know that it needs to adjust
4199 the result at the end by 'init_val'.
4201 FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
4202 is not NULL, because this way the initialization vector is simpler (the
4203 same element in all entries), and Option2 otherwise.
4205 A cost model should help decide between these two schemes. */
4207 tree
4208 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4209 tree *adjustment_def)
4211 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4212 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4213 tree scalar_type = TREE_TYPE (init_val);
4214 tree vectype = get_vectype_for_scalar_type (scalar_type);
4215 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4216 tree def_for_init;
4217 tree init_def;
4218 REAL_VALUE_TYPE real_init_val = dconst0;
4219 int int_init_val = 0;
4220 gimple_seq stmts = NULL;
4222 gcc_assert (vectype);
4224 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4225 || SCALAR_FLOAT_TYPE_P (scalar_type));
4227 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4228 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4230 vect_reduction_type reduction_type
4231 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4233 switch (code)
4235 case WIDEN_SUM_EXPR:
4236 case DOT_PROD_EXPR:
4237 case SAD_EXPR:
4238 case PLUS_EXPR:
4239 case MINUS_EXPR:
4240 case BIT_IOR_EXPR:
4241 case BIT_XOR_EXPR:
4242 case MULT_EXPR:
4243 case BIT_AND_EXPR:
4245 /* ADJUSTMENT_DEF is NULL when called from
4246 vect_create_epilog_for_reduction to vectorize double reduction. */
4247 if (adjustment_def)
4248 *adjustment_def = init_val;
4250 if (code == MULT_EXPR)
4252 real_init_val = dconst1;
4253 int_init_val = 1;
4256 if (code == BIT_AND_EXPR)
4257 int_init_val = -1;
4259 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4260 def_for_init = build_real (scalar_type, real_init_val);
4261 else
4262 def_for_init = build_int_cst (scalar_type, int_init_val);
4264 if (adjustment_def)
4265 /* Option1: the first element is '0' or '1' as well. */
4266 init_def = gimple_build_vector_from_val (&stmts, vectype,
4267 def_for_init);
4268 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4270 /* Option2 (variable length): the first element is INIT_VAL. */
4271 init_def = gimple_build_vector_from_val (&stmts, vectype,
4272 def_for_init);
4273 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4274 vectype, init_def, init_val);
4276 else
4278 /* Option2: the first element is INIT_VAL. */
4279 tree_vector_builder elts (vectype, 1, 2);
4280 elts.quick_push (init_val);
4281 elts.quick_push (def_for_init);
4282 init_def = gimple_build_vector (&stmts, &elts);
4285 break;
4287 case MIN_EXPR:
4288 case MAX_EXPR:
4289 case COND_EXPR:
4291 if (adjustment_def)
4293 *adjustment_def = NULL_TREE;
4294 if (reduction_type != COND_REDUCTION
4295 && reduction_type != EXTRACT_LAST_REDUCTION)
4297 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4298 break;
4301 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4302 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4304 break;
4306 default:
4307 gcc_unreachable ();
4310 if (stmts)
4311 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4312 return init_def;
4315 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4316 NUMBER_OF_VECTORS is the number of vector defs to create.
4317 If NEUTRAL_OP is nonnull, introducing extra elements of that
4318 value will not change the result. */
4320 static void
4321 get_initial_defs_for_reduction (slp_tree slp_node,
4322 vec<tree> *vec_oprnds,
4323 unsigned int number_of_vectors,
4324 bool reduc_chain, tree neutral_op)
4326 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4327 stmt_vec_info stmt_vinfo = stmts[0];
4328 unsigned HOST_WIDE_INT nunits;
4329 unsigned j, number_of_places_left_in_vector;
4330 tree vector_type;
4331 unsigned int group_size = stmts.length ();
4332 unsigned int i;
4333 struct loop *loop;
4335 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4337 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4339 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4340 gcc_assert (loop);
4341 edge pe = loop_preheader_edge (loop);
4343 gcc_assert (!reduc_chain || neutral_op);
4345 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4346 created vectors. It is greater than 1 if unrolling is performed.
4348 For example, we have two scalar operands, s1 and s2 (e.g., group of
4349 strided accesses of size two), while NUNITS is four (i.e., four scalars
4350 of this type can be packed in a vector). The output vector will contain
4351 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4352 will be 2).
4354 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4355 vectors containing the operands.
4357 For example, NUNITS is four as before, and the group size is 8
4358 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4359 {s5, s6, s7, s8}. */
4361 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4362 nunits = group_size;
4364 number_of_places_left_in_vector = nunits;
4365 bool constant_p = true;
4366 tree_vector_builder elts (vector_type, nunits, 1);
4367 elts.quick_grow (nunits);
4368 gimple_seq ctor_seq = NULL;
4369 for (j = 0; j < nunits * number_of_vectors; ++j)
4371 tree op;
4372 i = j % group_size;
4373 stmt_vinfo = stmts[i];
4375 /* Get the def before the loop. In a reduction chain we have only
4376 one initial value; otherwise we have as many as there are PHIs in the group. */
4377 if (reduc_chain)
4378 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4379 else if (((vec_oprnds->length () + 1) * nunits
4380 - number_of_places_left_in_vector >= group_size)
4381 && neutral_op)
4382 op = neutral_op;
4383 else
4384 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4386 /* Create 'vect_ = {op0,op1,...,opn}'. */
4387 number_of_places_left_in_vector--;
4388 elts[nunits - number_of_places_left_in_vector - 1] = op;
4389 if (!CONSTANT_CLASS_P (op))
4390 constant_p = false;
4392 if (number_of_places_left_in_vector == 0)
4394 tree init;
4395 if (constant_p && !neutral_op
4396 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4397 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4398 /* Build the vector directly from ELTS. */
4399 init = gimple_build_vector (&ctor_seq, &elts);
4400 else if (neutral_op)
4402 /* Build a vector of the neutral value and shift the
4403 other elements into place. */
4404 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4405 neutral_op);
4406 int k = nunits;
4407 while (k > 0 && elts[k - 1] == neutral_op)
4408 k -= 1;
4409 while (k > 0)
4411 k -= 1;
4412 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4413 vector_type, init, elts[k]);
4416 else
4418 /* First time round, duplicate ELTS to fill the
4419 required number of vectors. */
4420 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4421 number_of_vectors, *vec_oprnds);
4422 break;
4424 vec_oprnds->quick_push (init);
4426 number_of_places_left_in_vector = nunits;
4427 elts.new_vector (vector_type, nunits, 1);
4428 elts.quick_grow (nunits);
4429 constant_p = true;
4432 if (ctor_seq != NULL)
4433 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4437 /* Function vect_create_epilog_for_reduction
4439 Create code at the loop-epilog to finalize the result of a reduction
4440 computation.
4442 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4443 reduction statements.
4444 STMT_INFO is the scalar reduction stmt that is being vectorized.
4445 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4446 number of elements that we can fit in a vectype (nunits). In this case
4447 we have to generate more than one vector stmt - i.e., we need to "unroll"
4448 the vector stmt by a factor VF/nunits. For more details see documentation
4449 in vectorizable_operation.
4450 REDUC_FN is the internal function for the epilog reduction.
4451 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4452 computation.
4453 REDUC_INDEX is the index of the operand in the right hand side of the
4454 statement that is defined by REDUCTION_PHI.
4455 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4456 SLP_NODE is an SLP node containing a group of reduction statements. The
4457 first one in this group is STMT_INFO.
4458 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4459 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4460 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4461 any value of the IV in the loop.
4462 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4463 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4464 null if this is not an SLP reduction.
4466 This function:
4467 1. Creates the reduction def-use cycles: sets the arguments for
4468 REDUCTION_PHIS:
4469 The loop-entry argument is the vectorized initial-value of the reduction.
4470 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4471 sums.
4472 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4473 by calling the function specified by REDUC_FN if available, or by
4474 other means (whole-vector shifts or a scalar loop).
4475 The function also creates a new phi node at the loop exit to preserve
4476 loop-closed form, as illustrated below.
4478 The flow at the entry to this function:
4480 loop:
4481 vec_def = phi <null, null> # REDUCTION_PHI
4482 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4483 s_loop = scalar_stmt # (scalar) STMT_INFO
4484 loop_exit:
4485 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4486 use <s_out0>
4487 use <s_out0>
4489 The above is transformed by this function into:
4491 loop:
4492 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4493 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4494 s_loop = scalar_stmt # (scalar) STMT_INFO
4495 loop_exit:
4496 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4497 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4498 v_out2 = reduce <v_out1>
4499 s_out3 = extract_field <v_out2, 0>
4500 s_out4 = adjust_result <s_out3>
4501 use <s_out4>
4502 use <s_out4>
4505 static void
4506 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4507 stmt_vec_info stmt_info,
4508 gimple *reduc_def_stmt,
4509 int ncopies, internal_fn reduc_fn,
4510 vec<stmt_vec_info> reduction_phis,
4511 bool double_reduc,
4512 slp_tree slp_node,
4513 slp_instance slp_node_instance,
4514 tree induc_val, enum tree_code induc_code,
4515 tree neutral_op)
4517 stmt_vec_info prev_phi_info;
4518 tree vectype;
4519 machine_mode mode;
4520 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4521 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4522 basic_block exit_bb;
4523 tree scalar_dest;
4524 tree scalar_type;
4525 gimple *new_phi = NULL, *phi;
4526 stmt_vec_info phi_info;
4527 gimple_stmt_iterator exit_gsi;
4528 tree vec_dest;
4529 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4530 gimple *epilog_stmt = NULL;
4531 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4532 gimple *exit_phi;
4533 tree bitsize;
4534 tree adjustment_def = NULL;
4535 tree vec_initial_def = NULL;
4536 tree expr, def, initial_def = NULL;
4537 tree orig_name, scalar_result;
4538 imm_use_iterator imm_iter, phi_imm_iter;
4539 use_operand_p use_p, phi_use_p;
4540 gimple *use_stmt;
4541 stmt_vec_info reduction_phi_info = NULL;
4542 bool nested_in_vect_loop = false;
4543 auto_vec<gimple *> new_phis;
4544 auto_vec<stmt_vec_info> inner_phis;
4545 int j, i;
4546 auto_vec<tree> scalar_results;
4547 unsigned int group_size = 1, k, ratio;
4548 auto_vec<tree> vec_initial_defs;
4549 auto_vec<gimple *> phis;
4550 bool slp_reduc = false;
4551 bool direct_slp_reduc;
4552 tree new_phi_result;
4553 stmt_vec_info inner_phi = NULL;
4554 tree induction_index = NULL_TREE;
4556 if (slp_node)
4557 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4559 if (nested_in_vect_loop_p (loop, stmt_info))
4561 outer_loop = loop;
4562 loop = loop->inner;
4563 nested_in_vect_loop = true;
4564 gcc_assert (!slp_node);
4567 vectype = STMT_VINFO_VECTYPE (stmt_info);
4568 gcc_assert (vectype);
4569 mode = TYPE_MODE (vectype);
4571 /* 1. Create the reduction def-use cycle:
4572 Set the arguments of REDUCTION_PHIS, i.e., transform
4574 loop:
4575 vec_def = phi <null, null> # REDUCTION_PHI
4576 VECT_DEF = vector_stmt # vectorized form of STMT
4579 into:
4581 loop:
4582 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4583 VECT_DEF = vector_stmt # vectorized form of STMT
4586 (in case of SLP, do it for all the phis). */
4588 /* Get the loop-entry arguments. */
4589 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4590 if (slp_node)
4592 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4593 vec_initial_defs.reserve (vec_num);
4594 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4595 &vec_initial_defs, vec_num,
4596 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4597 neutral_op);
4599 else
4601 /* Get at the scalar def before the loop, that defines the initial value
4602 of the reduction variable. */
4603 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4604 loop_preheader_edge (loop));
4605 /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4606 and we can't use zero for induc_val, use initial_def as induc_val.
4607 Similarly for REDUC_MIN when initial_def is larger than the base. */
4608 if (TREE_CODE (initial_def) == INTEGER_CST
4609 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4610 == INTEGER_INDUC_COND_REDUCTION)
4611 && !integer_zerop (induc_val)
4612 && ((induc_code == MAX_EXPR
4613 && tree_int_cst_lt (initial_def, induc_val))
4614 || (induc_code == MIN_EXPR
4615 && tree_int_cst_lt (induc_val, initial_def))))
4616 induc_val = initial_def;
4618 if (double_reduc)
4619 /* In case of double reduction we only create a vector variable
4620 to be put in the reduction phi node. The actual statement
4621 creation is done later in this function. */
4622 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4623 else if (nested_in_vect_loop)
4625 /* Do not use an adjustment def as that case is not supported
4626 correctly if ncopies is not one. */
4627 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4628 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4629 stmt_info);
4631 else
4632 vec_initial_def
4633 = get_initial_def_for_reduction (stmt_info, initial_def,
4634 &adjustment_def);
4635 vec_initial_defs.create (1);
4636 vec_initial_defs.quick_push (vec_initial_def);
4639 /* Set phi nodes arguments. */
4640 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4642 tree vec_init_def = vec_initial_defs[i];
4643 tree def = vect_defs[i];
4644 for (j = 0; j < ncopies; j++)
4646 if (j != 0)
4648 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4649 if (nested_in_vect_loop)
4650 vec_init_def
4651 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4654 /* Set the loop-entry arg of the reduction-phi. */
4656 gphi *phi = as_a <gphi *> (phi_info->stmt);
4657 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4658 == INTEGER_INDUC_COND_REDUCTION)
4660 /* Initialise the reduction phi to zero. This prevents non-zero
4661 initial values from interfering with the reduction op. */
4662 gcc_assert (ncopies == 1);
4663 gcc_assert (i == 0);
4665 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4666 tree induc_val_vec
4667 = build_vector_from_val (vec_init_def_type, induc_val);
4669 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4670 UNKNOWN_LOCATION);
4672 else
4673 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4674 UNKNOWN_LOCATION);
4676 /* Set the loop-latch arg for the reduction-phi. */
4677 if (j > 0)
4678 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4680 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4682 if (dump_enabled_p ())
4683 dump_printf_loc (MSG_NOTE, vect_location,
4684 "transform reduction: created def-use cycle: %G%G",
4685 phi, SSA_NAME_DEF_STMT (def));
4689 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4690 which is updated with the current index of the loop for every match of
4691 the original loop's cond_expr (VEC_STMT). This results in a vector
4692 containing the last time the condition passed for that vector lane.
4693 The first match will be a 1 to allow 0 to be used for non-matching
4694 indexes. If there are no matches at all then the vector will be all
4695 zeroes. */
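  /* As a hypothetical illustration with four lanes (VF == 4): if the
     condition is true only in scalar iterations 1 and 6 (counting from 0),
     the index vector ends up as { 0, 2, 7, 0 } - lane 1 recorded index 2 in
     the first vector iteration and lane 2 recorded index 7 in the second,
     while the other lanes keep the initial 0.  */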
4696 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4698 tree indx_before_incr, indx_after_incr;
4699 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4701 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4702 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4704 int scalar_precision
4705 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4706 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4707 tree cr_index_vector_type = build_vector_type
4708 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4710 /* First we create a simple vector induction variable which starts
4711 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4712 vector size (STEP). */
4714 /* Create a {1,2,3,...} vector. */
4715 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4717 /* Create a vector of the step value. */
4718 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4719 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4721 /* Create an induction variable. */
4722 gimple_stmt_iterator incr_gsi;
4723 bool insert_after;
4724 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4725 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4726 insert_after, &indx_before_incr, &indx_after_incr);
4728 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4729 filled with zeros (VEC_ZERO). */
4731 /* Create a vector of 0s. */
4732 tree zero = build_zero_cst (cr_index_scalar_type);
4733 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4735 /* Create a vector phi node. */
4736 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4737 new_phi = create_phi_node (new_phi_tree, loop->header);
4738 loop_vinfo->add_stmt (new_phi);
4739 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4740 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4742 /* Now take the condition from the loop's original cond_expr
4743 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4744 every match uses values from the induction variable
4745 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4746 (NEW_PHI_TREE).
4747 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4748 the new cond_expr (INDEX_COND_EXPR). */
4750 /* Duplicate the condition from vec_stmt. */
4751 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4753 /* Create a conditional, where the condition is taken from vec_stmt
4754 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4755 else is the phi (NEW_PHI_TREE). */
4756 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4757 ccompare, indx_before_incr,
4758 new_phi_tree);
4759 induction_index = make_ssa_name (cr_index_vector_type);
4760 gimple *index_condition = gimple_build_assign (induction_index,
4761 index_cond_expr);
4762 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4763 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4764 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4766 /* Update the phi with the vec cond. */
4767 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4768 loop_latch_edge (loop), UNKNOWN_LOCATION);
4771 /* 2. Create epilog code.
4772 The reduction epilog code operates across the elements of the vector
4773 of partial results computed by the vectorized loop.
4774 The reduction epilog code consists of:
4776 step 1: compute the scalar result in a vector (v_out2)
4777 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4778 step 3: adjust the scalar result (s_out3) if needed.
4780 Step 1 can be accomplished using one of the following three schemes:
4781 (scheme 1) using reduc_fn, if available.
4782 (scheme 2) using whole-vector shifts, if available.
4783 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4784 combined.
4786 The overall epilog code looks like this:
4788 s_out0 = phi <s_loop> # original EXIT_PHI
4789 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4790 v_out2 = reduce <v_out1> # step 1
4791 s_out3 = extract_field <v_out2, 0> # step 2
4792 s_out4 = adjust_result <s_out3> # step 3
4794 (step 3 is optional, and steps 1 and 2 may be combined).
4795 Lastly, the uses of s_out0 are replaced by s_out4. */
4798 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4799 v_out1 = phi <VECT_DEF>
4800 Store them in NEW_PHIS. */
4802 exit_bb = single_exit (loop)->dest;
4803 prev_phi_info = NULL;
4804 new_phis.create (vect_defs.length ());
4805 FOR_EACH_VEC_ELT (vect_defs, i, def)
4807 for (j = 0; j < ncopies; j++)
4809 tree new_def = copy_ssa_name (def);
4810 phi = create_phi_node (new_def, exit_bb);
4811 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4812 if (j == 0)
4813 new_phis.quick_push (phi);
4814 else
4816 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4817 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4820 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4821 prev_phi_info = phi_info;
4825 /* The epilogue is created for the outer-loop, i.e., for the loop being
4826 vectorized. Create exit phis for the outer loop. */
4827 if (double_reduc)
4829 loop = outer_loop;
4830 exit_bb = single_exit (loop)->dest;
4831 inner_phis.create (vect_defs.length ());
4832 FOR_EACH_VEC_ELT (new_phis, i, phi)
4834 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4835 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4836 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4837 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4838 PHI_RESULT (phi));
4839 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4840 inner_phis.quick_push (phi_info);
4841 new_phis[i] = outer_phi;
4842 while (STMT_VINFO_RELATED_STMT (phi_info))
4844 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4845 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4846 outer_phi = create_phi_node (new_result, exit_bb);
4847 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4848 PHI_RESULT (phi_info->stmt));
4849 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4850 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4851 prev_phi_info = outer_phi_info;
4856 exit_gsi = gsi_after_labels (exit_bb);
4858 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4859 (i.e. when reduc_fn is not available) and in the final adjustment
4860 code (if needed). Also get the original scalar reduction variable as
4861 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4862 represents a reduction pattern), the tree-code and scalar-def are
4863 taken from the original stmt that the pattern-stmt (STMT) replaces.
4864 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4865 are taken from STMT. */
4867 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4868 if (orig_stmt_info != stmt_info)
4870 /* Reduction pattern */
4871 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4872 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4875 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4876 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4877 partial results are added and not subtracted. */
4878 if (code == MINUS_EXPR)
4879 code = PLUS_EXPR;
4881 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4882 scalar_type = TREE_TYPE (scalar_dest);
4883 scalar_results.create (group_size);
4884 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4885 bitsize = TYPE_SIZE (scalar_type);
4887 /* In case this is a reduction in an inner-loop while vectorizing an outer
4888 loop - we don't need to extract a single scalar result at the end of the
4889 inner-loop (unless it is double reduction, i.e., the use of reduction is
4890 outside the outer-loop). The final vector of partial results will be used
4891 in the vectorized outer-loop, or reduced to a scalar result at the end of
4892 the outer-loop. */
4893 if (nested_in_vect_loop && !double_reduc)
4894 goto vect_finalize_reduction;
4896 /* SLP reduction without reduction chain, e.g.,
4897 # a1 = phi <a2, a0>
4898 # b1 = phi <b2, b0>
4899 a2 = operation (a1)
4900 b2 = operation (b1) */
4901 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4903 /* True if we should implement SLP_REDUC using native reduction operations
4904 instead of scalar operations. */
4905 direct_slp_reduc = (reduc_fn != IFN_LAST
4906 && slp_reduc
4907 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4909 /* In case of reduction chain, e.g.,
4910 # a1 = phi <a3, a0>
4911 a2 = operation (a1)
4912 a3 = operation (a2),
4914 we may end up with more than one vector result. Here we reduce them to
4915 one vector. */
4916 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4918 tree first_vect = PHI_RESULT (new_phis[0]);
4919 gassign *new_vec_stmt = NULL;
4920 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4921 for (k = 1; k < new_phis.length (); k++)
4923 gimple *next_phi = new_phis[k];
4924 tree second_vect = PHI_RESULT (next_phi);
4925 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4926 new_vec_stmt = gimple_build_assign (tem, code,
4927 first_vect, second_vect);
4928 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4929 first_vect = tem;
4932 new_phi_result = first_vect;
4933 if (new_vec_stmt)
4935 new_phis.truncate (0);
4936 new_phis.safe_push (new_vec_stmt);
4939 /* Likewise if we couldn't use a single def-use cycle. */
4940 else if (ncopies > 1)
4942 gcc_assert (new_phis.length () == 1);
4943 tree first_vect = PHI_RESULT (new_phis[0]);
4944 gassign *new_vec_stmt = NULL;
4945 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4946 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4947 for (int k = 1; k < ncopies; ++k)
4949 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4950 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4951 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4952 new_vec_stmt = gimple_build_assign (tem, code,
4953 first_vect, second_vect);
4954 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4955 first_vect = tem;
4957 new_phi_result = first_vect;
4958 new_phis.truncate (0);
4959 new_phis.safe_push (new_vec_stmt);
4961 else
4962 new_phi_result = PHI_RESULT (new_phis[0]);
4964 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4965 && reduc_fn != IFN_LAST)
4967 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4968 various data values where the condition matched and another vector
4969 (INDUCTION_INDEX) containing all the indexes of those matches. We
4970 need to extract the last matching index (which will be the index with
4971 highest value) and use this to index into the data vector.
4972 For the case where there were no matches, the data vector will contain
4973 all default values and the index vector will be all zeros. */
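      /* Continuing the hypothetical four-lane illustration from the
	 induction-index comment above: with INDUCTION_INDEX = { 0, 2, 7, 0 }
	 the REDUC_MAX below yields 7, the comparison against { 7, 7, 7, 7 }
	 is true only for lane 2, so the VEC_COND keeps only that lane's data
	 value and the final unsigned REDUC_MAX extracts it.  */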
4975 /* Get various versions of the type of the vector of indexes. */
4976 tree index_vec_type = TREE_TYPE (induction_index);
4977 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4978 tree index_scalar_type = TREE_TYPE (index_vec_type);
4979 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4980 (index_vec_type);
4982 /* Get an unsigned integer version of the type of the data vector. */
4983 int scalar_precision
4984 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4985 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4986 tree vectype_unsigned = build_vector_type
4987 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4989 /* First we need to create a vector (ZERO_VEC) of zeros and another
4990 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4991 can create using a MAX reduction and then expanding.
4992 In the case where the loop never made any matches, the max index will
4993 be zero. */
4995 /* Vector of {0, 0, 0,...}. */
4996 tree zero_vec = make_ssa_name (vectype);
4997 tree zero_vec_rhs = build_zero_cst (vectype);
4998 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4999 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5001 /* Find maximum value from the vector of found indexes. */
5002 tree max_index = make_ssa_name (index_scalar_type);
5003 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5004 1, induction_index);
5005 gimple_call_set_lhs (max_index_stmt, max_index);
5006 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5008 /* Vector of {max_index, max_index, max_index,...}. */
5009 tree max_index_vec = make_ssa_name (index_vec_type);
5010 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5011 max_index);
5012 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5013 max_index_vec_rhs);
5014 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5016 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5017 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5018 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5019 otherwise. Only one value should match, resulting in a vector
5020 (VEC_COND) with one data value and the rest zeros.
5021 In the case where the loop never made any matches, every index will
5022 match, resulting in a vector with all data values (which will all be
5023 the default value). */
5025 /* Compare the max index vector to the vector of found indexes to find
5026 the position of the max value. */
5027 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5028 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5029 induction_index,
5030 max_index_vec);
5031 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5033 /* Use the compare to choose either values from the data vector or
5034 zero. */
5035 tree vec_cond = make_ssa_name (vectype);
5036 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5037 vec_compare, new_phi_result,
5038 zero_vec);
5039 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5041 /* Finally we need to extract the data value from the vector (VEC_COND)
5042 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5043 reduction, but because this doesn't exist, we can use a MAX reduction
5044 instead. The data value might be signed or a float so we need to cast
5045 it first.
5046 In the case where the loop never made any matches, the data values are
5047 all identical, and so will reduce down correctly. */
5049 /* Make the matched data values unsigned. */
5050 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5051 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5052 vec_cond);
5053 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5054 VIEW_CONVERT_EXPR,
5055 vec_cond_cast_rhs);
5056 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5058 /* Reduce down to a scalar value. */
5059 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5060 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5061 1, vec_cond_cast);
5062 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5063 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5065 /* Convert the reduced value back to the result type and set as the
5066 result. */
5067 gimple_seq stmts = NULL;
5068 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5069 data_reduc);
5070 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5071 scalar_results.safe_push (new_temp);
5073 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5074 && reduc_fn == IFN_LAST)
5076 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5077 idx = 0;
5078 idx_val = induction_index[0];
5079 val = data_reduc[0];
5080 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5081 if (induction_index[i] > idx_val)
5082 val = data_reduc[i], idx_val = induction_index[i];
5083 return val; */
5085 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5086 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5087 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5088 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5089 /* Enforced by vectorizable_reduction, which ensures we have target
5090 support before allowing a conditional reduction on variable-length
5091 vectors. */
5092 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5093 tree idx_val = NULL_TREE, val = NULL_TREE;
5094 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5096 tree old_idx_val = idx_val;
5097 tree old_val = val;
5098 idx_val = make_ssa_name (idx_eltype);
5099 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5100 build3 (BIT_FIELD_REF, idx_eltype,
5101 induction_index,
5102 bitsize_int (el_size),
5103 bitsize_int (off)));
5104 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5105 val = make_ssa_name (data_eltype);
5106 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5107 build3 (BIT_FIELD_REF,
5108 data_eltype,
5109 new_phi_result,
5110 bitsize_int (el_size),
5111 bitsize_int (off)));
5112 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5113 if (off != 0)
5115 tree new_idx_val = idx_val;
5116 tree new_val = val;
5117 if (off != v_size - el_size)
5119 new_idx_val = make_ssa_name (idx_eltype);
5120 epilog_stmt = gimple_build_assign (new_idx_val,
5121 MAX_EXPR, idx_val,
5122 old_idx_val);
5123 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5125 new_val = make_ssa_name (data_eltype);
5126 epilog_stmt = gimple_build_assign (new_val,
5127 COND_EXPR,
5128 build2 (GT_EXPR,
5129 boolean_type_node,
5130 idx_val,
5131 old_idx_val),
5132 val, old_val);
5133 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5134 idx_val = new_idx_val;
5135 val = new_val;
5138 /* Convert the reduced value back to the result type and set as the
5139 result. */
5140 gimple_seq stmts = NULL;
5141 val = gimple_convert (&stmts, scalar_type, val);
5142 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5143 scalar_results.safe_push (val);
5146 /* 2.3 Create the reduction code, using one of the three schemes described
5147 above. In SLP we simply need to extract all the elements from the
5148 vector (without reducing them), so we use scalar shifts. */
5149 else if (reduc_fn != IFN_LAST && !slp_reduc)
5151 tree tmp;
5152 tree vec_elem_type;
5154 /* Case 1: Create:
5155 v_out2 = reduc_expr <v_out1> */
5157 if (dump_enabled_p ())
5158 dump_printf_loc (MSG_NOTE, vect_location,
5159 "Reduce using direct vector reduction.\n");
5161 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5162 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5164 tree tmp_dest
5165 = vect_create_destination_var (scalar_dest, vec_elem_type);
5166 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5167 new_phi_result);
5168 gimple_set_lhs (epilog_stmt, tmp_dest);
5169 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5170 gimple_set_lhs (epilog_stmt, new_temp);
5171 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5173 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5174 new_temp);
5176 else
5178 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5179 new_phi_result);
5180 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5183 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5184 gimple_set_lhs (epilog_stmt, new_temp);
5185 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5187 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5188 == INTEGER_INDUC_COND_REDUCTION)
5189 && !operand_equal_p (initial_def, induc_val, 0))
5191 /* Earlier we set the initial value to be a vector of induc_val
5192 values. Check the result; if it is induc_val, replace it
5193 with the original initial value, unless induc_val is
5194 the same as initial_def already. */
5195 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5196 induc_val);
5198 tmp = make_ssa_name (new_scalar_dest);
5199 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5200 initial_def, new_temp);
5201 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5202 new_temp = tmp;
5205 scalar_results.safe_push (new_temp);
5207 else if (direct_slp_reduc)
5209 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5210 with the elements for other SLP statements replaced with the
5211 neutral value. We can then do a normal reduction on each vector. */
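      /* For instance, with REDUC_GROUP_SIZE == 2 the masked index vector
	 built below alternates { 0, 1, 0, 1, ... }; for SLP statement 0 we
	 keep the lanes whose index is 0 and replace the others with the
	 neutral value, and likewise with index 1 for SLP statement 1.  */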
5213 /* Enforced by vectorizable_reduction. */
5214 gcc_assert (new_phis.length () == 1);
5215 gcc_assert (pow2p_hwi (group_size));
5217 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5218 vec<stmt_vec_info> orig_phis
5219 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5220 gimple_seq seq = NULL;
5222 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5223 and the same element size as VECTYPE. */
5224 tree index = build_index_vector (vectype, 0, 1);
5225 tree index_type = TREE_TYPE (index);
5226 tree index_elt_type = TREE_TYPE (index_type);
5227 tree mask_type = build_same_sized_truth_vector_type (index_type);
5229 /* Create a vector that, for each element, identifies which of
5230 the REDUC_GROUP_SIZE results should use it. */
5231 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5232 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5233 build_vector_from_val (index_type, index_mask));
5235 /* Get a neutral vector value. This is simply a splat of the neutral
5236 scalar value if we have one, otherwise the initial scalar value
5237 is itself a neutral value. */
5238 tree vector_identity = NULL_TREE;
5239 if (neutral_op)
5240 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5241 neutral_op);
5242 for (unsigned int i = 0; i < group_size; ++i)
5244 /* If there's no universal neutral value, we can use the
5245 initial scalar value from the original PHI. This is used
5246 for MIN and MAX reduction, for example. */
5247 if (!neutral_op)
5249 tree scalar_value
5250 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5251 loop_preheader_edge (loop));
5252 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5253 scalar_value);
5256 /* Calculate the equivalent of:
5258 sel[j] = (index[j] == i);
5260 which selects the elements of NEW_PHI_RESULT that should
5261 be included in the result. */
5262 tree compare_val = build_int_cst (index_elt_type, i);
5263 compare_val = build_vector_from_val (index_type, compare_val);
5264 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5265 index, compare_val);
5267 /* Calculate the equivalent of:
5269 vec = sel ? new_phi_result : vector_identity;
5271 VEC is now suitable for a full vector reduction. */
5272 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5273 sel, new_phi_result, vector_identity);
5275 /* Do the reduction and convert it to the appropriate type. */
5276 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5277 TREE_TYPE (vectype), vec);
5278 scalar = gimple_convert (&seq, scalar_type, scalar);
5279 scalar_results.safe_push (scalar);
5281 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5283 else
5285 bool reduce_with_shift;
5286 tree vec_temp;
5288 /* COND reductions all do the final reduction with MAX_EXPR
5289 or MIN_EXPR. */
5290 if (code == COND_EXPR)
5292 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5293 == INTEGER_INDUC_COND_REDUCTION)
5294 code = induc_code;
5295 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5296 == CONST_COND_REDUCTION)
5297 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5298 else
5299 code = MAX_EXPR;
5302 /* See if the target wants to do the final (shift) reduction
5303 in a vector mode of smaller size and first reduce upper/lower
5304 halves against each other. */
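/* Sketch of the idea: if the target chooses to split a V8SI reduction
   into V4SI, the loop below first combines the low and high V4SI halves
   with CODE (e.g. lo + hi for a sum) and then the usual shift or scalar
   reduction runs on the narrower V4SI value.  Whether to split at all is
   the target's call via targetm.vectorize.split_reduction.  */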
5305 enum machine_mode mode1 = mode;
5306 tree vectype1 = vectype;
5307 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5308 unsigned sz1 = sz;
5309 if (!slp_reduc
5310 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5311 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5313 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5314 reduce_with_shift = have_whole_vector_shift (mode1);
5315 if (!VECTOR_MODE_P (mode1))
5316 reduce_with_shift = false;
5317 else
5319 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5320 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5321 reduce_with_shift = false;
5324 /* First reduce the vector to the desired vector size we should
5325 do shift reduction on by combining upper and lower halves. */
5326 new_temp = new_phi_result;
5327 while (sz > sz1)
5329 gcc_assert (!slp_reduc);
5330 sz /= 2;
5331 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5333 /* The target has to make sure we support lowpart/highpart
5334 extraction, either via direct vector extract or through
5335 an integer mode punning. */
5336 tree dst1, dst2;
5337 if (convert_optab_handler (vec_extract_optab,
5338 TYPE_MODE (TREE_TYPE (new_temp)),
5339 TYPE_MODE (vectype1))
5340 != CODE_FOR_nothing)
5342 /* Extract sub-vectors directly once vec_extract becomes
5343 a conversion optab. */
5344 dst1 = make_ssa_name (vectype1);
5345 epilog_stmt
5346 = gimple_build_assign (dst1, BIT_FIELD_REF,
5347 build3 (BIT_FIELD_REF, vectype1,
5348 new_temp, TYPE_SIZE (vectype1),
5349 bitsize_int (0)));
5350 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351 dst2 = make_ssa_name (vectype1);
5352 epilog_stmt
5353 = gimple_build_assign (dst2, BIT_FIELD_REF,
5354 build3 (BIT_FIELD_REF, vectype1,
5355 new_temp, TYPE_SIZE (vectype1),
5356 bitsize_int (sz * BITS_PER_UNIT)));
5357 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5359 else
5361 /* Extract via punning to appropriately sized integer mode
5362 vector. */
5363 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5365 tree etype = build_vector_type (eltype, 2);
5366 gcc_assert (convert_optab_handler (vec_extract_optab,
5367 TYPE_MODE (etype),
5368 TYPE_MODE (eltype))
5369 != CODE_FOR_nothing);
5370 tree tem = make_ssa_name (etype);
5371 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5372 build1 (VIEW_CONVERT_EXPR,
5373 etype, new_temp));
5374 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5375 new_temp = tem;
5376 tem = make_ssa_name (eltype);
5377 epilog_stmt
5378 = gimple_build_assign (tem, BIT_FIELD_REF,
5379 build3 (BIT_FIELD_REF, eltype,
5380 new_temp, TYPE_SIZE (eltype),
5381 bitsize_int (0)));
5382 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5383 dst1 = make_ssa_name (vectype1);
5384 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5385 build1 (VIEW_CONVERT_EXPR,
5386 vectype1, tem));
5387 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5388 tem = make_ssa_name (eltype);
5389 epilog_stmt
5390 = gimple_build_assign (tem, BIT_FIELD_REF,
5391 build3 (BIT_FIELD_REF, eltype,
5392 new_temp, TYPE_SIZE (eltype),
5393 bitsize_int (sz * BITS_PER_UNIT)));
5394 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5395 dst2 = make_ssa_name (vectype1);
5396 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5397 build1 (VIEW_CONVERT_EXPR,
5398 vectype1, tem));
5399 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5402 new_temp = make_ssa_name (vectype1);
5403 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5404 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5407 if (reduce_with_shift && !slp_reduc)
5409 int element_bitsize = tree_to_uhwi (bitsize);
5410 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5411 for variable-length vectors and also requires direct target support
5412 for loop reductions. */
5413 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5414 int nelements = vec_size_in_bits / element_bitsize;
5415 vec_perm_builder sel;
5416 vec_perm_indices indices;
5418 int elt_offset;
5420 tree zero_vec = build_zero_cst (vectype1);
5421 /* Case 2: Create:
5422 for (offset = nelements/2; offset >= 1; offset/=2)
5424 Create: va' = vec_shift <va, offset>
5425 Create: va = vop <va, va'>
5426 } */
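/* A concrete sketch for a V4SI sum with va = {a0, a1, a2, a3}:

     offset 2:  va' = vec_shift <va, 2>  ->  {a2, a3, 0, 0}
                va  = va + va'           ->  {a0+a2, a1+a3, a2, a3}
     offset 1:  va' = vec_shift <va, 1>  ->  {a1+a3, a2, a3, 0}
                va  = va + va'           ->  {a0+a1+a2+a3, ...}

   so element 0 ends up holding the scalar result that step 2.4 extracts;
   the remaining lanes hold irrelevant partial values.  */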
5428 tree rhs;
5430 if (dump_enabled_p ())
5431 dump_printf_loc (MSG_NOTE, vect_location,
5432 "Reduce using vector shifts\n");
5434 mode1 = TYPE_MODE (vectype1);
5435 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5436 for (elt_offset = nelements / 2;
5437 elt_offset >= 1;
5438 elt_offset /= 2)
5440 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5441 indices.new_vector (sel, 2, nelements);
5442 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5443 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5444 new_temp, zero_vec, mask);
5445 new_name = make_ssa_name (vec_dest, epilog_stmt);
5446 gimple_assign_set_lhs (epilog_stmt, new_name);
5447 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5449 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5450 new_temp);
5451 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5452 gimple_assign_set_lhs (epilog_stmt, new_temp);
5453 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5456 /* 2.4 Extract the final scalar result. Create:
5457 s_out3 = extract_field <v_out2, bitpos> */
5459 if (dump_enabled_p ())
5460 dump_printf_loc (MSG_NOTE, vect_location,
5461 "extract scalar result\n");
5463 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5464 bitsize, bitsize_zero_node);
5465 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5466 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5467 gimple_assign_set_lhs (epilog_stmt, new_temp);
5468 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5469 scalar_results.safe_push (new_temp);
5471 else
5473 /* Case 3: Create:
5474 s = extract_field <v_out2, 0>
5475 for (offset = element_size;
5476 offset < vector_size;
5477 offset += element_size;)
5479 Create: s' = extract_field <v_out2, offset>
5480 Create: s = op <s, s'> // For non SLP cases
5481 } */
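/* Sketch for a V4SI sum v_out2 = {a0, a1, a2, a3} in the non-SLP case
   (bit offsets assume 32-bit elements):

     s  = extract_field <v_out2, 0>                  // a0
     s' = extract_field <v_out2, 32>;  s = s + s';   // a0+a1
     s' = extract_field <v_out2, 64>;  s = s + s';   // a0+a1+a2
     s' = extract_field <v_out2, 96>;  s = s + s';   // a0+a1+a2+a3

   In the SLP case the extracted values are pushed into SCALAR_RESULTS
   without being combined here.  */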
5483 if (dump_enabled_p ())
5484 dump_printf_loc (MSG_NOTE, vect_location,
5485 "Reduce using scalar code.\n");
5487 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5488 int element_bitsize = tree_to_uhwi (bitsize);
5489 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5491 int bit_offset;
5492 if (gimple_code (new_phi) == GIMPLE_PHI)
5493 vec_temp = PHI_RESULT (new_phi);
5494 else
5495 vec_temp = gimple_assign_lhs (new_phi);
5496 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5497 bitsize_zero_node);
5498 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5499 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5500 gimple_assign_set_lhs (epilog_stmt, new_temp);
5501 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5503 /* In SLP we don't need to apply reduction operation, so we just
5504 collect s' values in SCALAR_RESULTS. */
5505 if (slp_reduc)
5506 scalar_results.safe_push (new_temp);
5508 for (bit_offset = element_bitsize;
5509 bit_offset < vec_size_in_bits;
5510 bit_offset += element_bitsize)
5512 tree bitpos = bitsize_int (bit_offset);
5513 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5514 bitsize, bitpos);
5516 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5517 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5518 gimple_assign_set_lhs (epilog_stmt, new_name);
5519 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5521 if (slp_reduc)
5523 /* In SLP we don't need to apply reduction operation, so
5524 we just collect s' values in SCALAR_RESULTS. */
5525 new_temp = new_name;
5526 scalar_results.safe_push (new_name);
5528 else
5530 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5531 new_name, new_temp);
5532 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5533 gimple_assign_set_lhs (epilog_stmt, new_temp);
5534 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5539 /* The only case where we need to reduce scalar results in SLP is
5540 unrolling. If the size of SCALAR_RESULTS is greater than
5541 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5542 REDUC_GROUP_SIZE. */
5543 if (slp_reduc)
5545 tree res, first_res, new_res;
5546 gimple *new_stmt;
5548 /* Reduce multiple scalar results in case of SLP unrolling. */
5549 for (j = group_size; scalar_results.iterate (j, &res);
5550 j++)
5552 first_res = scalar_results[j % group_size];
5553 new_stmt = gimple_build_assign (new_scalar_dest, code,
5554 first_res, res);
5555 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5556 gimple_assign_set_lhs (new_stmt, new_res);
5557 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5558 scalar_results[j % group_size] = new_res;
5561 else
5562 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5563 scalar_results.safe_push (new_temp);
5566 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5567 == INTEGER_INDUC_COND_REDUCTION)
5568 && !operand_equal_p (initial_def, induc_val, 0))
5570 /* Earlier we set the initial value to be a vector of induc_val
5571 values. Check the result and if it is induc_val then replace
5572 with the original initial value, unless induc_val is
5573 the same as initial_def already. */
5574 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5575 induc_val);
5577 tree tmp = make_ssa_name (new_scalar_dest);
5578 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5579 initial_def, new_temp);
5580 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5581 scalar_results[0] = tmp;
5585 vect_finalize_reduction:
5587 if (double_reduc)
5588 loop = loop->inner;
5590 /* 2.5 Adjust the final result by the initial value of the reduction
5591 variable. (When such adjustment is not needed, then
5592 'adjustment_def' is zero). For example, if code is PLUS we create:
5593 new_temp = loop_exit_def + adjustment_def */
5595 if (adjustment_def)
5597 gcc_assert (!slp_reduc);
5598 if (nested_in_vect_loop)
5600 new_phi = new_phis[0];
5601 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5602 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5603 new_dest = vect_create_destination_var (scalar_dest, vectype);
5605 else
5607 new_temp = scalar_results[0];
5608 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5609 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5610 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5613 epilog_stmt = gimple_build_assign (new_dest, expr);
5614 new_temp = make_ssa_name (new_dest, epilog_stmt);
5615 gimple_assign_set_lhs (epilog_stmt, new_temp);
5616 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617 if (nested_in_vect_loop)
5619 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5620 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5621 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5623 if (!double_reduc)
5624 scalar_results.quick_push (new_temp);
5625 else
5626 scalar_results[0] = new_temp;
5628 else
5629 scalar_results[0] = new_temp;
5631 new_phis[0] = epilog_stmt;
5634 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5635 phis with new adjusted scalar results, i.e., replace use <s_out0>
5636 with use <s_out4>.
5638 Transform:
5639 loop_exit:
5640 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5641 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5642 v_out2 = reduce <v_out1>
5643 s_out3 = extract_field <v_out2, 0>
5644 s_out4 = adjust_result <s_out3>
5645 use <s_out0>
5646 use <s_out0>
5648 into:
5650 loop_exit:
5651 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5652 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5653 v_out2 = reduce <v_out1>
5654 s_out3 = extract_field <v_out2, 0>
5655 s_out4 = adjust_result <s_out3>
5656 use <s_out4>
5657 use <s_out4> */
5660 /* In SLP reduction chain we reduce vector results into one vector if
5661 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5662 LHS of the last stmt in the reduction chain, since we are looking for
5663 the loop exit phi node. */
5664 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5666 stmt_vec_info dest_stmt_info
5667 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5668 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5669 group_size = 1;
5672 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5673 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5674 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5675 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5676 correspond to the first vector stmt, etc.
5677 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5678 if (group_size > new_phis.length ())
5680 ratio = group_size / new_phis.length ();
5681 gcc_assert (!(group_size % new_phis.length ()));
5683 else
5684 ratio = 1;
5686 stmt_vec_info epilog_stmt_info = NULL;
5687 for (k = 0; k < group_size; k++)
5689 if (k % ratio == 0)
5691 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5692 reduction_phi_info = reduction_phis[k / ratio];
5693 if (double_reduc)
5694 inner_phi = inner_phis[k / ratio];
5697 if (slp_reduc)
5699 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5701 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5702 /* SLP statements can't participate in patterns. */
5703 gcc_assert (!orig_stmt_info);
5704 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5707 phis.create (3);
5708 /* Find the loop-closed-use at the loop exit of the original scalar
5709 result. (The reduction result is expected to have two immediate uses -
5710 one at the latch block, and one at the loop exit). */
5711 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5712 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5713 && !is_gimple_debug (USE_STMT (use_p)))
5714 phis.safe_push (USE_STMT (use_p));
5716 /* While we expect to have found an exit_phi because of loop-closed-ssa
5717 form we can end up without one if the scalar cycle is dead. */
5719 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5721 if (outer_loop)
5723 stmt_vec_info exit_phi_vinfo
5724 = loop_vinfo->lookup_stmt (exit_phi);
5725 gphi *vect_phi;
5727 if (double_reduc)
5728 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5729 else
5730 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5731 if (!double_reduc
5732 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5733 != vect_double_reduction_def)
5734 continue;
5736 /* Handle double reduction:
5738 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5739 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5740 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5741 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5743 At that point the regular reduction (stmt2 and stmt3) is
5744 already vectorized, as well as the exit phi node, stmt4.
5745 Here we vectorize the phi node of double reduction, stmt1, and
5746 update all relevant statements. */
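/* A typical source-level shape of such a double reduction, shown only
   for illustration, is an outer-loop accumulation whose inner loop also
   accumulates into the same variable:

     int s = 0;
     for (i = 0; i < n; i++)      // outer loop, the one being vectorized
       for (j = 0; j < m; j++)    // inner loop
         s += a[i][j];

   s1/s2 above correspond to the outer-loop phi and its latch value,
   while s3/s4 belong to the inner-loop reduction cycle.  */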
5748 /* Go through all the uses of s2 to find double reduction phi
5749 node, i.e., stmt1 above. */
5750 orig_name = PHI_RESULT (exit_phi);
5751 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5753 stmt_vec_info use_stmt_vinfo;
5754 tree vect_phi_init, preheader_arg, vect_phi_res;
5755 basic_block bb = gimple_bb (use_stmt);
5757 /* Check that USE_STMT is really double reduction phi
5758 node. */
5759 if (gimple_code (use_stmt) != GIMPLE_PHI
5760 || gimple_phi_num_args (use_stmt) != 2
5761 || bb->loop_father != outer_loop)
5762 continue;
5763 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5764 if (!use_stmt_vinfo
5765 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5766 != vect_double_reduction_def)
5767 continue;
5769 /* Create vector phi node for double reduction:
5770 vs1 = phi <vs0, vs2>
5771 vs1 was created previously in this function by a call to
5772 vect_get_vec_def_for_operand and is stored in
5773 vec_initial_def;
5774 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5775 vs0 is created here. */
5777 /* Create vector phi node. */
5778 vect_phi = create_phi_node (vec_initial_def, bb);
5779 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5781 /* Create vs0 - initial def of the double reduction phi. */
5782 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5783 loop_preheader_edge (outer_loop));
5784 vect_phi_init = get_initial_def_for_reduction
5785 (stmt_info, preheader_arg, NULL);
5787 /* Update phi node arguments with vs0 and vs2. */
5788 add_phi_arg (vect_phi, vect_phi_init,
5789 loop_preheader_edge (outer_loop),
5790 UNKNOWN_LOCATION);
5791 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5792 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5793 if (dump_enabled_p ())
5794 dump_printf_loc (MSG_NOTE, vect_location,
5795 "created double reduction phi node: %G",
5796 vect_phi);
5798 vect_phi_res = PHI_RESULT (vect_phi);
5800 /* Replace the use, i.e., set the correct vs1 in the regular
5801 reduction phi node. FORNOW, NCOPIES is always 1, so the
5802 loop is redundant. */
5803 stmt_vec_info use_info = reduction_phi_info;
5804 for (j = 0; j < ncopies; j++)
5806 edge pr_edge = loop_preheader_edge (loop);
5807 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5808 pr_edge->dest_idx, vect_phi_res);
5809 use_info = STMT_VINFO_RELATED_STMT (use_info);
5815 phis.release ();
5816 if (nested_in_vect_loop)
5818 if (double_reduc)
5819 loop = outer_loop;
5820 else
5821 continue;
5824 phis.create (3);
5825 /* Find the loop-closed-use at the loop exit of the original scalar
5826 result. (The reduction result is expected to have two immediate uses,
5827 one at the latch block, and one at the loop exit). For double
5828 reductions we are looking for exit phis of the outer loop. */
5829 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5831 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5833 if (!is_gimple_debug (USE_STMT (use_p)))
5834 phis.safe_push (USE_STMT (use_p));
5836 else
5838 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5840 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5842 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5844 if (!flow_bb_inside_loop_p (loop,
5845 gimple_bb (USE_STMT (phi_use_p)))
5846 && !is_gimple_debug (USE_STMT (phi_use_p)))
5847 phis.safe_push (USE_STMT (phi_use_p));
5853 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5855 /* Replace the uses: */
5856 orig_name = PHI_RESULT (exit_phi);
5857 scalar_result = scalar_results[k];
5858 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5859 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5860 SET_USE (use_p, scalar_result);
5863 phis.release ();
5867 /* Return a vector of type VECTYPE that is equal to the vector select
5868 operation "MASK ? VEC : IDENTITY". Insert the select statements
5869 before GSI. */
5871 static tree
5872 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5873 tree vec, tree identity)
5875 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5876 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5877 mask, vec, identity);
5878 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5879 return cond;
5882 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5883 order, starting with LHS. Insert the extraction statements before GSI and
5884 associate the new scalar SSA names with variable SCALAR_DEST.
5885 Return the SSA name for the result. */
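/* For example (purely as a sketch): with a four-element VECTOR_RHS v
   and CODE == PLUS_EXPR the expansion below is equivalent to

     s0 = lhs + v[0];
     s1 = s0 + v[1];
     s2 = s1 + v[2];
     s3 = s2 + v[3];

   returning s3, i.e. it keeps the strict left-to-right association that
   an in-order (fold-left) reduction requires.  */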
5887 static tree
5888 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5889 tree_code code, tree lhs, tree vector_rhs)
5891 tree vectype = TREE_TYPE (vector_rhs);
5892 tree scalar_type = TREE_TYPE (vectype);
5893 tree bitsize = TYPE_SIZE (scalar_type);
5894 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5895 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5897 for (unsigned HOST_WIDE_INT bit_offset = 0;
5898 bit_offset < vec_size_in_bits;
5899 bit_offset += element_bitsize)
5901 tree bitpos = bitsize_int (bit_offset);
5902 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5903 bitsize, bitpos);
5905 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5906 rhs = make_ssa_name (scalar_dest, stmt);
5907 gimple_assign_set_lhs (stmt, rhs);
5908 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5910 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5911 tree new_name = make_ssa_name (scalar_dest, stmt);
5912 gimple_assign_set_lhs (stmt, new_name);
5913 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5914 lhs = new_name;
5916 return lhs;
5919 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5920 type of the vector input. */
5922 static internal_fn
5923 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5925 internal_fn mask_reduc_fn;
5927 switch (reduc_fn)
5929 case IFN_FOLD_LEFT_PLUS:
5930 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5931 break;
5933 default:
5934 return IFN_LAST;
5937 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5938 OPTIMIZE_FOR_SPEED))
5939 return mask_reduc_fn;
5940 return IFN_LAST;
5943 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5944 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5945 statement. CODE is the operation performed by STMT_INFO and OPS are
5946 its scalar operands. REDUC_INDEX is the index of the operand in
5947 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5948 implements in-order reduction, or IFN_LAST if we should open-code it.
5949 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5950 that should be used to control the operation in a fully-masked loop. */
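/* As a rough picture of what this function handles: a float accumulation
   such as

     for (i = 0; i < n; i++)
       res += a[i];

   compiled without reassociation permission must preserve the original
   left-to-right evaluation order, so each vector of loaded elements is
   folded into the scalar accumulator in order, either via REDUC_FN (or
   its masked variant) or with the open-coded expansion in
   vect_expand_fold_left above.  */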
5952 static bool
5953 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5954 gimple_stmt_iterator *gsi,
5955 stmt_vec_info *vec_stmt, slp_tree slp_node,
5956 gimple *reduc_def_stmt,
5957 tree_code code, internal_fn reduc_fn,
5958 tree ops[3], tree vectype_in,
5959 int reduc_index, vec_loop_masks *masks)
5961 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5962 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5963 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5964 stmt_vec_info new_stmt_info = NULL;
5965 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5967 int ncopies;
5968 if (slp_node)
5969 ncopies = 1;
5970 else
5971 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5973 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5974 gcc_assert (ncopies == 1);
5975 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5976 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5977 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5978 == FOLD_LEFT_REDUCTION);
5980 if (slp_node)
5981 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5982 TYPE_VECTOR_SUBPARTS (vectype_in)));
5984 tree op0 = ops[1 - reduc_index];
5986 int group_size = 1;
5987 stmt_vec_info scalar_dest_def_info;
5988 auto_vec<tree> vec_oprnds0;
5989 if (slp_node)
5991 auto_vec<vec<tree> > vec_defs (2);
5992 auto_vec<tree> sops(2);
5993 sops.quick_push (ops[0]);
5994 sops.quick_push (ops[1]);
5995 vect_get_slp_defs (sops, slp_node, &vec_defs);
5996 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5997 vec_defs[0].release ();
5998 vec_defs[1].release ();
5999 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6000 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6002 else
6004 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
6005 vec_oprnds0.create (1);
6006 vec_oprnds0.quick_push (loop_vec_def0);
6007 scalar_dest_def_info = stmt_info;
6010 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6011 tree scalar_type = TREE_TYPE (scalar_dest);
6012 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6014 int vec_num = vec_oprnds0.length ();
6015 gcc_assert (vec_num == 1 || slp_node);
6016 tree vec_elem_type = TREE_TYPE (vectype_out);
6017 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6019 tree vector_identity = NULL_TREE;
6020 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6021 vector_identity = build_zero_cst (vectype_out);
6023 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6024 int i;
6025 tree def0;
6026 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6028 gimple *new_stmt;
6029 tree mask = NULL_TREE;
6030 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6031 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6033 /* Handle MINUS by adding the negative. */
6034 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6036 tree negated = make_ssa_name (vectype_out);
6037 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6038 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6039 def0 = negated;
6042 if (mask && mask_reduc_fn == IFN_LAST)
6043 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6044 vector_identity);
6046 /* On the first iteration the input is simply the scalar phi
6047 result, and for subsequent iterations it is the output of
6048 the preceding operation. */
6049 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6051 if (mask && mask_reduc_fn != IFN_LAST)
6052 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6053 def0, mask);
6054 else
6055 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6056 def0);
6057 /* For chained SLP reductions the output of the previous reduction
6058 operation serves as the input of the next. For the final statement
6059 the output cannot be a temporary - we reuse the original
6060 scalar destination of the last statement. */
6061 if (i != vec_num - 1)
6063 gimple_set_lhs (new_stmt, scalar_dest_var);
6064 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6065 gimple_set_lhs (new_stmt, reduc_var);
6068 else
6070 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6071 reduc_var, def0);
6072 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6073 /* Remove the statement, so that we can use the same code paths
6074 as for statements that we've just created. */
6075 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6076 gsi_remove (&tmp_gsi, true);
6079 if (i == vec_num - 1)
6081 gimple_set_lhs (new_stmt, scalar_dest);
6082 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6083 new_stmt);
6085 else
6086 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
6087 new_stmt, gsi);
6089 if (slp_node)
6090 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6093 if (!slp_node)
6094 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6096 return true;
6099 /* Function is_nonwrapping_integer_induction.
6101 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6102 does not cause overflow. */
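/* Put differently: with BASE and STEP the evolution of the induction and
   NI an upper bound on the number of iterations, the code below requires

     BASE + STEP * NI

   computed in infinite precision to fit in the precision of the result
   type, unless overflow is undefined for that type anyway.  */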
6104 static bool
6105 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
6107 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6108 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6109 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6110 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6111 widest_int ni, max_loop_value, lhs_max;
6112 wi::overflow_type overflow = wi::OVF_NONE;
6114 /* Make sure the loop is integer based. */
6115 if (TREE_CODE (base) != INTEGER_CST
6116 || TREE_CODE (step) != INTEGER_CST)
6117 return false;
6119 /* Check that the max size of the loop will not wrap. */
6121 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6122 return true;
6124 if (! max_stmt_executions (loop, &ni))
6125 return false;
6127 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6128 &overflow);
6129 if (overflow)
6130 return false;
6132 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6133 TYPE_SIGN (lhs_type), &overflow);
6134 if (overflow)
6135 return false;
6137 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6138 <= TYPE_PRECISION (lhs_type));
6141 /* Check if masking can be supported by inserting a conditional expression.
6142 CODE is the code for the operation. COND_FN is the conditional internal
6143 function, if it exists. VECTYPE_IN is the type of the vector input. */
6144 static bool
6145 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6146 tree vectype_in)
6148 if (cond_fn != IFN_LAST
6149 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6150 OPTIMIZE_FOR_SPEED))
6151 return false;
6153 switch (code)
6155 case DOT_PROD_EXPR:
6156 case SAD_EXPR:
6157 return true;
6159 default:
6160 return false;
6164 /* Insert a conditional expression to enable masked vectorization. CODE is the
6165 code for the operation. VOP is the array of operands. MASK is the loop
6166 mask. GSI is a statement iterator used to place the new conditional
6167 expression. */
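/* The replacement values are chosen so that masked-off lanes leave the
   accumulator unchanged; roughly:

     DOT_PROD_EXPR:  masked_op1 = mask ? op1 : 0;    // op0 * 0 adds 0
     SAD_EXPR:       masked_op1 = mask ? op1 : op0;  // |op0 - op0| adds 0

   This is only a summary of the two cases handled below.  */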
6168 static void
6169 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6170 gimple_stmt_iterator *gsi)
6172 switch (code)
6174 case DOT_PROD_EXPR:
6176 tree vectype = TREE_TYPE (vop[1]);
6177 tree zero = build_zero_cst (vectype);
6178 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6179 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6180 mask, vop[1], zero);
6181 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6182 vop[1] = masked_op1;
6183 break;
6186 case SAD_EXPR:
6188 tree vectype = TREE_TYPE (vop[1]);
6189 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6190 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6191 mask, vop[1], vop[0]);
6192 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6193 vop[1] = masked_op1;
6194 break;
6197 default:
6198 gcc_unreachable ();
6202 /* Function vectorizable_reduction.
6204 Check if STMT_INFO performs a reduction operation that can be vectorized.
6205 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6206 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6207 Return true if STMT_INFO is vectorizable in this way.
6209 This function also handles reduction idioms (patterns) that have been
6210 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6211 may be of this form:
6212 X = pattern_expr (arg0, arg1, ..., X)
6213 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6214 sequence that had been detected and replaced by the pattern-stmt
6215 (STMT_INFO).
6217 This function also handles reduction of condition expressions, for example:
6218 for (int i = 0; i < N; i++)
6219 if (a[i] < value)
6220 last = a[i];
6221 This is handled by vectorising the loop and creating an additional vector
6222 containing the loop indexes for which "a[i] < value" was true. In the
6223 function epilogue this is reduced to a single max value and then used to
6224 index into the vector of results.
6226 In some cases of reduction patterns, the type of the reduction variable X is
6227 different than the type of the other arguments of STMT_INFO.
6228 In such cases, the vectype that is used when transforming STMT_INFO into
6229 a vector stmt is different than the vectype that is used to determine the
6230 vectorization factor, because it consists of a different number of elements
6231 than the actual number of elements that are being operated upon in parallel.
6233 For example, consider an accumulation of shorts into an int accumulator.
6234 On some targets it's possible to vectorize this pattern operating on 8
6235 shorts at a time (hence, the vectype for purposes of determining the
6236 vectorization factor should be V8HI); on the other hand, the vectype that
6237 is used to create the vector form is actually V4SI (the type of the result).
6239 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6240 indicates what is the actual level of parallelism (V8HI in the example), so
6241 that the right vectorization factor would be derived. This vectype
6242 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6243 be used to create the vectorized stmt. The right vectype for the vectorized
6244 stmt is obtained from the type of the result X:
6245 get_vectype_for_scalar_type (TREE_TYPE (X))
6247 This means that, contrary to "regular" reductions (or "regular" stmts in
6248 general), the following equation:
6249 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6250 does *NOT* necessarily hold for reduction patterns. */
6252 bool
6253 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6254 stmt_vec_info *vec_stmt, slp_tree slp_node,
6255 slp_instance slp_node_instance,
6256 stmt_vector_for_cost *cost_vec)
6258 tree vec_dest;
6259 tree scalar_dest;
6260 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6261 tree vectype_in = NULL_TREE;
6262 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6263 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6264 enum tree_code code, orig_code;
6265 internal_fn reduc_fn;
6266 machine_mode vec_mode;
6267 int op_type;
6268 optab optab;
6269 tree new_temp = NULL_TREE;
6270 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6271 stmt_vec_info cond_stmt_vinfo = NULL;
6272 enum tree_code cond_reduc_op_code = ERROR_MARK;
6273 tree scalar_type;
6274 bool is_simple_use;
6275 int i;
6276 int ncopies;
6277 int epilog_copies;
6278 stmt_vec_info prev_stmt_info, prev_phi_info;
6279 bool single_defuse_cycle = false;
6280 stmt_vec_info new_stmt_info = NULL;
6281 int j;
6282 tree ops[3];
6283 enum vect_def_type dts[3];
6284 bool nested_cycle = false, found_nested_cycle_def = false;
6285 bool double_reduc = false;
6286 basic_block def_bb;
6287 struct loop * def_stmt_loop;
6288 tree def_arg;
6289 auto_vec<tree> vec_oprnds0;
6290 auto_vec<tree> vec_oprnds1;
6291 auto_vec<tree> vec_oprnds2;
6292 auto_vec<tree> vect_defs;
6293 auto_vec<stmt_vec_info> phis;
6294 int vec_num;
6295 tree def0, tem;
6296 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6297 tree cond_reduc_val = NULL_TREE;
6299 /* Make sure it was already recognized as a reduction computation. */
6300 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6301 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6302 return false;
6304 if (nested_in_vect_loop_p (loop, stmt_info))
6306 loop = loop->inner;
6307 nested_cycle = true;
6310 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6311 gcc_assert (slp_node
6312 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6314 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6316 tree phi_result = gimple_phi_result (phi);
6317 /* Analysis is fully done on the reduction stmt invocation. */
6318 if (! vec_stmt)
6320 if (slp_node)
6321 slp_node_instance->reduc_phis = slp_node;
6323 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6324 return true;
6327 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6328 /* Leave the scalar phi in place. Note that checking
6329 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6330 for reductions involving a single statement. */
6331 return true;
6333 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6334 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6336 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6337 == EXTRACT_LAST_REDUCTION)
6338 /* Leave the scalar phi in place. */
6339 return true;
6341 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6342 code = gimple_assign_rhs_code (reduc_stmt);
6343 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6345 tree op = gimple_op (reduc_stmt, k);
6346 if (op == phi_result)
6347 continue;
6348 if (k == 1 && code == COND_EXPR)
6349 continue;
6350 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6351 gcc_assert (is_simple_use);
6352 if (dt == vect_constant_def || dt == vect_external_def)
6353 continue;
6354 if (!vectype_in
6355 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6356 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6357 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6358 break;
6360 /* For a nested cycle we might end up with an operation like
6361 phi_result * phi_result. */
6362 if (!vectype_in)
6363 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6364 gcc_assert (vectype_in);
6366 if (slp_node)
6367 ncopies = 1;
6368 else
6369 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6371 stmt_vec_info use_stmt_info;
6372 if (ncopies > 1
6373 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6374 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6375 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6376 single_defuse_cycle = true;
6378 /* Create the destination vector */
6379 scalar_dest = gimple_assign_lhs (reduc_stmt);
6380 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6382 if (slp_node)
6383 /* The size vect_schedule_slp_instance computes is off for us. */
6384 vec_num = vect_get_num_vectors
6385 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6386 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6387 vectype_in);
6388 else
6389 vec_num = 1;
6391 /* Generate the reduction PHIs upfront. */
6392 prev_phi_info = NULL;
6393 for (j = 0; j < ncopies; j++)
6395 if (j == 0 || !single_defuse_cycle)
6397 for (i = 0; i < vec_num; i++)
6399 /* Create the reduction-phi that defines the reduction
6400 operand. */
6401 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6402 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6404 if (slp_node)
6405 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6406 else
6408 if (j == 0)
6409 STMT_VINFO_VEC_STMT (stmt_info)
6410 = *vec_stmt = new_phi_info;
6411 else
6412 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6413 prev_phi_info = new_phi_info;
6419 return true;
6422 /* 1. Is vectorizable reduction? */
6423 /* Not supportable if the reduction variable is used in the loop, unless
6424 it's a reduction chain. */
6425 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6426 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6427 return false;
6429 /* Reductions that are not used even in an enclosing outer-loop
6430 are expected to be "live" (used out of the loop). */
6431 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6432 && !STMT_VINFO_LIVE_P (stmt_info))
6433 return false;
6435 /* 2. Has this been recognized as a reduction pattern?
6437 Check if STMT represents a pattern that has been recognized
6438 in earlier analysis stages. For stmts that represent a pattern,
6439 the STMT_VINFO_RELATED_STMT field records the last stmt in
6440 the original sequence that constitutes the pattern. */
6442 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6443 if (orig_stmt_info)
6445 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6446 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6449 /* 3. Check the operands of the operation. The first operands are defined
6450 inside the loop body. The last operand is the reduction variable,
6451 which is defined by the loop-header-phi. */
6453 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6455 /* Flatten RHS. */
6456 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6458 case GIMPLE_BINARY_RHS:
6459 code = gimple_assign_rhs_code (stmt);
6460 op_type = TREE_CODE_LENGTH (code);
6461 gcc_assert (op_type == binary_op);
6462 ops[0] = gimple_assign_rhs1 (stmt);
6463 ops[1] = gimple_assign_rhs2 (stmt);
6464 break;
6466 case GIMPLE_TERNARY_RHS:
6467 code = gimple_assign_rhs_code (stmt);
6468 op_type = TREE_CODE_LENGTH (code);
6469 gcc_assert (op_type == ternary_op);
6470 ops[0] = gimple_assign_rhs1 (stmt);
6471 ops[1] = gimple_assign_rhs2 (stmt);
6472 ops[2] = gimple_assign_rhs3 (stmt);
6473 break;
6475 case GIMPLE_UNARY_RHS:
6476 return false;
6478 default:
6479 gcc_unreachable ();
6482 if (code == COND_EXPR && slp_node)
6483 return false;
6485 scalar_dest = gimple_assign_lhs (stmt);
6486 scalar_type = TREE_TYPE (scalar_dest);
6487 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6488 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6489 return false;
6491 /* Do not try to vectorize bit-precision reductions. */
6492 if (!type_has_mode_precision_p (scalar_type))
6493 return false;
6495 /* All uses but the last are expected to be defined in the loop.
6496 The last use is the reduction variable. In case of nested cycle this
6497 assumption is not true: we use reduc_index to record the index of the
6498 reduction variable. */
6499 stmt_vec_info reduc_def_info;
6500 if (orig_stmt_info)
6501 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6502 else
6503 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6504 gcc_assert (reduc_def_info);
6505 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6506 tree reduc_def = PHI_RESULT (reduc_def_phi);
6507 int reduc_index = -1;
6508 for (i = 0; i < op_type; i++)
6510 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6511 if (i == 0 && code == COND_EXPR)
6512 continue;
6514 stmt_vec_info def_stmt_info;
6515 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6516 &def_stmt_info);
6517 dt = dts[i];
6518 gcc_assert (is_simple_use);
6519 if (dt == vect_reduction_def
6520 && ops[i] == reduc_def)
6522 reduc_index = i;
6523 continue;
6525 else if (tem)
6527 /* To properly compute ncopies we are interested in the widest
6528 input type in case we're looking at a widening accumulation. */
6529 if (!vectype_in
6530 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6531 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6532 vectype_in = tem;
6535 if (dt != vect_internal_def
6536 && dt != vect_external_def
6537 && dt != vect_constant_def
6538 && dt != vect_induction_def
6539 && !(dt == vect_nested_cycle && nested_cycle))
6540 return false;
6542 if (dt == vect_nested_cycle
6543 && ops[i] == reduc_def)
6545 found_nested_cycle_def = true;
6546 reduc_index = i;
6549 if (i == 1 && code == COND_EXPR)
6551 /* Record how value of COND_EXPR is defined. */
6552 if (dt == vect_constant_def)
6554 cond_reduc_dt = dt;
6555 cond_reduc_val = ops[i];
6557 if (dt == vect_induction_def
6558 && def_stmt_info
6559 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6561 cond_reduc_dt = dt;
6562 cond_stmt_vinfo = def_stmt_info;
6567 if (!vectype_in)
6568 vectype_in = vectype_out;
6570 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6571 directly used in stmt. */
6572 if (reduc_index == -1)
6574 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6576 if (dump_enabled_p ())
6577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6578 "in-order reduction chain without SLP.\n");
6579 return false;
6583 if (!(reduc_index == -1
6584 || dts[reduc_index] == vect_reduction_def
6585 || dts[reduc_index] == vect_nested_cycle
6586 || ((dts[reduc_index] == vect_internal_def
6587 || dts[reduc_index] == vect_external_def
6588 || dts[reduc_index] == vect_constant_def
6589 || dts[reduc_index] == vect_induction_def)
6590 && nested_cycle && found_nested_cycle_def)))
6592 /* For pattern recognized stmts, orig_stmt might be a reduction,
6593 but some helper statements for the pattern might not, or
6594 might be COND_EXPRs with reduction uses in the condition. */
6595 gcc_assert (orig_stmt_info);
6596 return false;
6599 /* PHIs should not participate in patterns. */
6600 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6601 enum vect_reduction_type v_reduc_type
6602 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6603 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6605 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6606 /* If we have a condition reduction, see if we can simplify it further. */
6607 if (v_reduc_type == COND_REDUCTION)
6609 /* TODO: We can't yet handle reduction chains, since we need to treat
6610 each COND_EXPR in the chain specially, not just the last one.
6611 E.g. for:
6613 x_1 = PHI <x_3, ...>
6614 x_2 = a_2 ? ... : x_1;
6615 x_3 = a_3 ? ... : x_2;
6617 we're interested in the last element in x_3 for which a_2 || a_3
6618 is true, whereas the current reduction chain handling would
6619 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6620 as a reduction operation. */
6621 if (reduc_index == -1)
6623 if (dump_enabled_p ())
6624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6625 "conditional reduction chains not supported\n");
6626 return false;
6629 /* vect_is_simple_reduction ensured that operand 2 is the
6630 loop-carried operand. */
6631 gcc_assert (reduc_index == 2);
6633 /* Loop peeling modifies initial value of reduction PHI, which
6634 makes the reduction stmt to be transformed different from the
6635 original stmt analyzed. We need to record reduction code for
6636 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6637 it can be used directly at transform stage. */
6638 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6639 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6641 /* Also set the reduction type to CONST_COND_REDUCTION. */
6642 gcc_assert (cond_reduc_dt == vect_constant_def);
6643 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6645 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6646 vectype_in, OPTIMIZE_FOR_SPEED))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6650 "optimizing condition reduction with"
6651 " FOLD_EXTRACT_LAST.\n");
6652 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6654 else if (cond_reduc_dt == vect_induction_def)
6656 tree base
6657 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6658 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6660 gcc_assert (TREE_CODE (base) == INTEGER_CST
6661 && TREE_CODE (step) == INTEGER_CST);
6662 cond_reduc_val = NULL_TREE;
6663 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6664 above base; punt if base is the minimum value of the type for
6665 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6666 if (tree_int_cst_sgn (step) == -1)
6668 cond_reduc_op_code = MIN_EXPR;
6669 if (tree_int_cst_sgn (base) == -1)
6670 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6671 else if (tree_int_cst_lt (base,
6672 TYPE_MAX_VALUE (TREE_TYPE (base))))
6673 cond_reduc_val
6674 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6676 else
6678 cond_reduc_op_code = MAX_EXPR;
6679 if (tree_int_cst_sgn (base) == 1)
6680 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6681 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6682 base))
6683 cond_reduc_val
6684 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6686 if (cond_reduc_val)
6688 if (dump_enabled_p ())
6689 dump_printf_loc (MSG_NOTE, vect_location,
6690 "condition expression based on "
6691 "integer induction.\n");
6692 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6693 = INTEGER_INDUC_COND_REDUCTION;
6696 else if (cond_reduc_dt == vect_constant_def)
6698 enum vect_def_type cond_initial_dt;
6699 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6700 tree cond_initial_val
6701 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6703 gcc_assert (cond_reduc_val != NULL_TREE);
6704 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6705 if (cond_initial_dt == vect_constant_def
6706 && types_compatible_p (TREE_TYPE (cond_initial_val),
6707 TREE_TYPE (cond_reduc_val)))
6709 tree e = fold_binary (LE_EXPR, boolean_type_node,
6710 cond_initial_val, cond_reduc_val);
6711 if (e && (integer_onep (e) || integer_zerop (e)))
6713 if (dump_enabled_p ())
6714 dump_printf_loc (MSG_NOTE, vect_location,
6715 "condition expression based on "
6716 "compile time constant.\n");
6717 /* Record reduction code at analysis stage. */
6718 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6719 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6720 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6721 = CONST_COND_REDUCTION;
6727 if (orig_stmt_info)
6728 gcc_assert (tmp == orig_stmt_info
6729 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6730 else
6731 /* We changed STMT to be the first stmt in reduction chain, hence we
6732 check that in this case the first element in the chain is STMT. */
6733 gcc_assert (tmp == stmt_info
6734 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6736 if (STMT_VINFO_LIVE_P (reduc_def_info))
6737 return false;
6739 if (slp_node)
6740 ncopies = 1;
6741 else
6742 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6744 gcc_assert (ncopies >= 1);
6746 vec_mode = TYPE_MODE (vectype_in);
6747 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6749 if (nested_cycle)
6751 def_bb = gimple_bb (reduc_def_phi);
6752 def_stmt_loop = def_bb->loop_father;
6753 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6754 loop_preheader_edge (def_stmt_loop));
6755 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6756 if (def_arg_stmt_info
6757 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6758 == vect_double_reduction_def))
6759 double_reduc = true;
6762 vect_reduction_type reduction_type
6763 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6764 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6765 && ncopies > 1)
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "multiple types in double reduction or condition "
6770 "reduction.\n");
6771 return false;
6774 if (code == COND_EXPR)
6776 /* Only call during the analysis stage, otherwise we'll lose
6777 STMT_VINFO_TYPE. */
6778 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6779 true, NULL, cost_vec))
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 "unsupported condition in reduction\n");
6784 return false;
6787 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6788 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6790 /* Only call during the analysis stage, otherwise we'll lose
6791 STMT_VINFO_TYPE. We only support this for nested cycles
6792 without double reductions at the moment. */
6793 if (!nested_cycle
6794 || double_reduc
6795 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6796 NULL, cost_vec)))
6798 if (dump_enabled_p ())
6799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6800 "unsupported shift or rotation in reduction\n");
6801 return false;
6804 else
6806 /* 4. Supportable by target? */
6808 /* 4.1. check support for the operation in the loop */
6809 optab = optab_for_tree_code (code, vectype_in, optab_default);
6810 if (!optab)
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6814 "no optab.\n");
6816 return false;
6819 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6821 if (dump_enabled_p ())
6822 dump_printf (MSG_NOTE, "op not supported by target.\n");
6824 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6825 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6826 return false;
6828 if (dump_enabled_p ())
6829 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6832 /* Worthwhile without SIMD support? */
6833 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6834 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838 "not worthwhile without SIMD support.\n");
6840 return false;
6844 /* 4.2. Check support for the epilog operation.
6846 If STMT represents a reduction pattern, then the type of the
6847 reduction variable may be different than the type of the rest
6848 of the arguments. For example, consider the case of accumulation
6849 of shorts into an int accumulator. The original code:
6850 S1: int_a = (int) short_a;
6851 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6853 was replaced with:
6854 STMT: int_acc = widen_sum <short_a, int_acc>
6856 This means that:
6857 1. The tree-code that is used to create the vector operation in the
6858 epilog code (that reduces the partial results) is not the
6859 tree-code of STMT, but is rather the tree-code of the original
6860 stmt from the pattern that STMT is replacing. I.e, in the example
6861 above we want to use 'widen_sum' in the loop, but 'plus' in the
6862 epilog.
6863 2. The type (mode) we use to check available target support
6864 for the vector operation to be created in the *epilog*, is
6865 determined by the type of the reduction variable (in the example
6866 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6867 However the type (mode) we use to check available target support
6868 for the vector operation to be created *inside the loop*, is
6869 determined by the type of the other arguments to STMT (in the
6870 example we'd check this: optab_handler (widen_sum_optab,
6871 vect_short_mode)).
6873 This is contrary to "regular" reductions, in which the types of all
6874 the arguments are the same as the type of the reduction variable.
6875 For "regular" reductions we can therefore use the same vector type
6876 (and also the same tree-code) when generating the epilog code and
6877 when generating the code inside the loop. */
6879 if (orig_stmt_info
6880 && (reduction_type == TREE_CODE_REDUCTION
6881 || reduction_type == FOLD_LEFT_REDUCTION))
6883 /* This is a reduction pattern: get the vectype from the type of the
6884 reduction variable, and get the tree-code from orig_stmt. */
6885 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6886 gcc_assert (vectype_out);
6887 vec_mode = TYPE_MODE (vectype_out);
6889 else
6891 /* Regular reduction: the same vectype and tree-code as used for
6892 the vector code inside the loop can be used for the epilog code. */
6893 orig_code = code;
6895 if (code == MINUS_EXPR)
6896 orig_code = PLUS_EXPR;
6898 /* For simple condition reductions, replace with the actual expression
6899 we want to base our reduction around. */
6900 if (reduction_type == CONST_COND_REDUCTION)
6902 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6903 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6905 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6906 orig_code = cond_reduc_op_code;
6909 reduc_fn = IFN_LAST;
6911 if (reduction_type == TREE_CODE_REDUCTION
6912 || reduction_type == FOLD_LEFT_REDUCTION
6913 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6914 || reduction_type == CONST_COND_REDUCTION)
6916 if (reduction_type == FOLD_LEFT_REDUCTION
6917 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6918 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6920 if (reduc_fn != IFN_LAST
6921 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6922 OPTIMIZE_FOR_SPEED))
6924 if (dump_enabled_p ())
6925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6926 "reduc op not supported by target.\n");
6928 reduc_fn = IFN_LAST;
6931 else
6933 if (!nested_cycle || double_reduc)
6935 if (dump_enabled_p ())
6936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6937 "no reduc code for scalar code.\n");
6939 return false;
6943 else if (reduction_type == COND_REDUCTION)
6945 int scalar_precision
6946 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6947 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6948 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6949 nunits_out);
6951 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6952 OPTIMIZE_FOR_SPEED))
6953 reduc_fn = IFN_REDUC_MAX;
6956 if (reduction_type != EXTRACT_LAST_REDUCTION
6957 && (!nested_cycle || double_reduc)
6958 && reduc_fn == IFN_LAST
6959 && !nunits_out.is_constant ())
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6963 "missing target support for reduction on"
6964 " variable-length vectors.\n");
6965 return false;
6968 /* For SLP reductions, see if there is a neutral value we can use. */
6969 tree neutral_op = NULL_TREE;
6970 if (slp_node)
6971 neutral_op = neutral_op_for_slp_reduction
6972 (slp_node_instance->reduc_phis, code,
6973 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6975 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6977 /* We can't support in-order reductions of code such as this:
6979 for (int i = 0; i < n1; ++i)
6980 for (int j = 0; j < n2; ++j)
6981 l += a[j];
6983 since GCC effectively transforms the loop when vectorizing:
6985 for (int i = 0; i < n1 / VF; ++i)
6986 for (int j = 0; j < n2; ++j)
6987 for (int k = 0; k < VF; ++k)
6988 l += a[j];
6990 which is a reassociation of the original operation. */
6991 if (dump_enabled_p ())
6992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6993 "in-order double reduction not supported.\n");
6995 return false;
6998 if (reduction_type == FOLD_LEFT_REDUCTION
6999 && slp_node
7000 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7002 /* We cannot use in-order reductions in this case because there is
7003 an implicit reassociation of the operations involved. */
7004 if (dump_enabled_p ())
7005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7006 "in-order unchained SLP reductions not supported.\n");
7007 return false;
7010 /* For double reductions, and for SLP reductions with a neutral value,
7011 we construct a variable-length initial vector by loading a vector
7012 full of the neutral value and then shift-and-inserting the start
7013 values into the low-numbered elements. */
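/* For instance (a sketch, assuming a sum reduction whose neutral value
   is 0): the initial vector is conceptually { init, 0, 0, ... }, built
   by loading an all-zeros vector and shift-inserting the start value
   into its lowest element with the IFN_VEC_SHL_INSERT operation.  */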
7014 if ((double_reduc || neutral_op)
7015 && !nunits_out.is_constant ()
7016 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7017 vectype_out, OPTIMIZE_FOR_SPEED))
7019 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7021 "reduction on variable-length vectors requires"
7022 " target support for a vector-shift-and-insert"
7023 " operation.\n");
7024 return false;
7027 /* Check extra constraints for variable-length unchained SLP reductions. */
7028 if (STMT_SLP_TYPE (stmt_info)
7029 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7030 && !nunits_out.is_constant ())
7032 /* We checked above that we could build the initial vector when
7033 there's a neutral element value. Check here for the case in
7034 which each SLP statement has its own initial value and in which
7035 that value needs to be repeated for every instance of the
7036 statement within the initial vector. */
7037 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7038 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7039 if (!neutral_op
7040 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7042 if (dump_enabled_p ())
7043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7044 "unsupported form of SLP reduction for"
7045 " variable-length vectors: cannot build"
7046 " initial vector.\n");
7047 return false;
7049 /* The epilogue code relies on the number of elements being a multiple
7050 of the group size. The duplicate-and-interleave approach to setting
7051 up the initial vector does too. */
7052 if (!multiple_p (nunits_out, group_size))
7054 if (dump_enabled_p ())
7055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7056 "unsupported form of SLP reduction for"
7057 " variable-length vectors: the vector size"
7058 " is not a multiple of the number of results.\n");
7059 return false;
7063 /* In case of widening multiplication by a constant, we update the type
7064 of the constant to be the type of the other operand. We check that the
7065 constant fits the type in the pattern recognition pass. */
7066 if (code == DOT_PROD_EXPR
7067 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7069 if (TREE_CODE (ops[0]) == INTEGER_CST)
7070 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7071 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7072 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7073 else
7075 if (dump_enabled_p ())
7076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7077 "invalid types in dot-prod\n");
7079 return false;
7083 if (reduction_type == COND_REDUCTION)
7085 widest_int ni;
7087 if (! max_loop_iterations (loop, &ni))
7089 if (dump_enabled_p ())
7090 dump_printf_loc (MSG_NOTE, vect_location,
7091 "loop count not known, cannot create cond "
7092 "reduction.\n");
7093 return false;
7095 /* Convert backedges to iterations. */
7096 ni += 1;
7098 /* The additional index will be the same type as the condition. Check
7099 that the loop iteration count fits into this type less one (because
7100 we'll use up the zero slot for when there are no matches). */
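/* E.g. with an 8-bit unsigned index type (maximum value 255), the check
   below rejects loops that may run 255 or more iterations, since index 0
   is reserved for the no-match case (illustrative width).  */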
7101 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7102 if (wi::geu_p (ni, wi::to_widest (max_index)))
7104 if (dump_enabled_p ())
7105 dump_printf_loc (MSG_NOTE, vect_location,
7106 "loop size is greater than data size.\n");
7107 return false;
7111 /* In case the vectorization factor (VF) is bigger than the number
7112 of elements that we can fit in a vectype (nunits), we have to generate
7113 more than one vector stmt - i.e., we need to "unroll" the
7114 vector stmt by a factor VF/nunits. For more details see documentation
7115 in vectorizable_operation. */
7117 /* If the reduction is used in an outer loop we need to generate
7118 VF intermediate results, like so (e.g. for ncopies=2):
7119 r0 = phi (init, r0)
7120 r1 = phi (init, r1)
7121 r0 = x0 + r0;
7122 r1 = x1 + r1;
7123 (i.e. we generate VF results in 2 registers).
7124 In this case we have a separate def-use cycle for each copy, and therefore
7125 for each copy we get the vector def for the reduction variable from the
7126 respective phi node created for this copy.
7128 Otherwise (the reduction is unused in the loop nest), we can combine
7129 together intermediate results, like so (e.g. for ncopies=2):
7130 r = phi (init, r)
7131 r = x0 + r;
7132 r = x1 + r;
7133 (i.e. we generate VF/2 results in a single register).
7134 In this case for each copy we get the vector def for the reduction variable
7135 from the vectorized reduction operation generated in the previous iteration.
7137 This only works when we see both the reduction PHI and its only consumer
7138 in vectorizable_reduction and there are no intermediate stmts
7139 participating. */
7140 stmt_vec_info use_stmt_info;
7141 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7142 if (ncopies > 1
7143 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7144 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7145 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7147 single_defuse_cycle = true;
7148 epilog_copies = 1;
7150 else
7151 epilog_copies = ncopies;
7153 /* If the reduction stmt is one of the patterns that have an embedded
7154 lane reduction, we cannot handle the case of ! single_defuse_cycle. */
7155 if ((ncopies > 1
7156 && ! single_defuse_cycle)
7157 && (code == DOT_PROD_EXPR
7158 || code == WIDEN_SUM_EXPR
7159 || code == SAD_EXPR))
7161 if (dump_enabled_p ())
7162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7163 "multi def-use cycle not possible for lane-reducing "
7164 "reduction operation\n");
7165 return false;
7168 if (slp_node)
7169 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7170 else
7171 vec_num = 1;
7173 internal_fn cond_fn = get_conditional_internal_fn (code);
7174 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7175 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7177 if (!vec_stmt) /* transformation not required. */
7179 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7180 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7182 if (reduction_type != FOLD_LEFT_REDUCTION
7183 && !mask_by_cond_expr
7184 && (cond_fn == IFN_LAST
7185 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7186 OPTIMIZE_FOR_SPEED)))
7188 if (dump_enabled_p ())
7189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7190 "can't use a fully-masked loop because no"
7191 " conditional operation is available.\n");
7192 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7194 else if (reduc_index == -1)
7196 if (dump_enabled_p ())
7197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7198 "can't use a fully-masked loop for chained"
7199 " reductions.\n");
7200 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7202 else
7203 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7204 vectype_in);
7206 if (dump_enabled_p ()
7207 && reduction_type == FOLD_LEFT_REDUCTION)
7208 dump_printf_loc (MSG_NOTE, vect_location,
7209 "using an in-order (fold-left) reduction.\n");
7210 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7211 return true;
7214 /* Transform. */
7216 if (dump_enabled_p ())
7217 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7219 /* FORNOW: Multiple types are not supported for condition. */
7220 if (code == COND_EXPR)
7221 gcc_assert (ncopies == 1);
7223 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7225 if (reduction_type == FOLD_LEFT_REDUCTION)
7226 return vectorize_fold_left_reduction
7227 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7228 reduc_fn, ops, vectype_in, reduc_index, masks);
7230 if (reduction_type == EXTRACT_LAST_REDUCTION)
7232 gcc_assert (!slp_node);
7233 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7234 true, NULL, NULL);
7237 /* Create the destination vector */
7238 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7240 prev_stmt_info = NULL;
7241 prev_phi_info = NULL;
7242 if (!slp_node)
7244 vec_oprnds0.create (1);
7245 vec_oprnds1.create (1);
7246 if (op_type == ternary_op)
7247 vec_oprnds2.create (1);
7250 phis.create (vec_num);
7251 vect_defs.create (vec_num);
7252 if (!slp_node)
7253 vect_defs.quick_push (NULL_TREE);
7255 if (slp_node)
7256 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7257 else
7258 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7260 for (j = 0; j < ncopies; j++)
7262 if (code == COND_EXPR)
7264 gcc_assert (!slp_node);
7265 vectorizable_condition (stmt_info, gsi, vec_stmt,
7266 true, NULL, NULL);
7267 break;
7269 if (code == LSHIFT_EXPR
7270 || code == RSHIFT_EXPR)
7272 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7273 break;
7276 /* Handle uses. */
7277 if (j == 0)
7279 if (slp_node)
7281 /* Get vec defs for all the operands except the reduction index,
7282 ensuring the ordering of the ops in the vector is kept. */
7283 auto_vec<tree, 3> slp_ops;
7284 auto_vec<vec<tree>, 3> vec_defs;
7286 slp_ops.quick_push (ops[0]);
7287 slp_ops.quick_push (ops[1]);
7288 if (op_type == ternary_op)
7289 slp_ops.quick_push (ops[2]);
7291 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7293 vec_oprnds0.safe_splice (vec_defs[0]);
7294 vec_defs[0].release ();
7295 vec_oprnds1.safe_splice (vec_defs[1]);
7296 vec_defs[1].release ();
7297 if (op_type == ternary_op)
7299 vec_oprnds2.safe_splice (vec_defs[2]);
7300 vec_defs[2].release ();
7303 else
7305 vec_oprnds0.quick_push
7306 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7307 vec_oprnds1.quick_push
7308 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7309 if (op_type == ternary_op)
7310 vec_oprnds2.quick_push
7311 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7314 else
7316 if (!slp_node)
7318 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7320 if (single_defuse_cycle && reduc_index == 0)
7321 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7322 else
7323 vec_oprnds0[0]
7324 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7325 vec_oprnds0[0]);
7326 if (single_defuse_cycle && reduc_index == 1)
7327 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7328 else
7329 vec_oprnds1[0]
7330 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7331 vec_oprnds1[0]);
7332 if (op_type == ternary_op)
7334 if (single_defuse_cycle && reduc_index == 2)
7335 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7336 else
7337 vec_oprnds2[0]
7338 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7339 vec_oprnds2[0]);
7344 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7346 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7347 if (masked_loop_p && !mask_by_cond_expr)
7349 /* Make sure that the reduction accumulator is vop[0]. */
7350 if (reduc_index == 1)
7352 gcc_assert (commutative_tree_code (code));
7353 std::swap (vop[0], vop[1]);
7355 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7356 vectype_in, i * ncopies + j);
7357 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7358 vop[0], vop[1],
7359 vop[0]);
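/* E.g. for a PLUS_EXPR reduction this builds
   IFN_COND_ADD (loop_mask, acc, x, acc), so lanes disabled by the loop
   mask simply keep the accumulator value (a sketch of the conditional
   call built here).  */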
7360 new_temp = make_ssa_name (vec_dest, call);
7361 gimple_call_set_lhs (call, new_temp);
7362 gimple_call_set_nothrow (call, true);
7363 new_stmt_info
7364 = vect_finish_stmt_generation (stmt_info, call, gsi);
7366 else
7368 if (op_type == ternary_op)
7369 vop[2] = vec_oprnds2[i];
7371 if (masked_loop_p && mask_by_cond_expr)
7373 tree mask = vect_get_loop_mask (gsi, masks,
7374 vec_num * ncopies,
7375 vectype_in, i * ncopies + j);
7376 build_vect_cond_expr (code, vop, mask, gsi);
7379 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7380 vop[0], vop[1], vop[2]);
7381 new_temp = make_ssa_name (vec_dest, new_stmt);
7382 gimple_assign_set_lhs (new_stmt, new_temp);
7383 new_stmt_info
7384 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7387 if (slp_node)
7389 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7390 vect_defs.quick_push (new_temp);
7392 else
7393 vect_defs[0] = new_temp;
7396 if (slp_node)
7397 continue;
7399 if (j == 0)
7400 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7401 else
7402 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7404 prev_stmt_info = new_stmt_info;
7407 /* Finalize the reduction-phi (set its arguments) and create the
7408 epilog reduction code. */
7409 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7410 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7412 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7413 epilog_copies, reduc_fn, phis,
7414 double_reduc, slp_node, slp_node_instance,
7415 cond_reduc_val, cond_reduc_op_code,
7416 neutral_op);
7418 return true;
7421 /* Function vect_min_worthwhile_factor.
7423 For a loop where we could vectorize the operation indicated by CODE,
7424 return the minimum vectorization factor that makes it worthwhile
7425 to use generic vectors. */
7426 static unsigned int
7427 vect_min_worthwhile_factor (enum tree_code code)
7429 switch (code)
7431 case PLUS_EXPR:
7432 case MINUS_EXPR:
7433 case NEGATE_EXPR:
7434 return 4;
7436 case BIT_AND_EXPR:
7437 case BIT_IOR_EXPR:
7438 case BIT_XOR_EXPR:
7439 case BIT_NOT_EXPR:
7440 return 2;
7442 default:
7443 return INT_MAX;
7447 /* Return true if VINFO indicates we are doing loop vectorization and if
7448 it is worth decomposing CODE operations into scalar operations for
7449 that loop's vectorization factor. */
7451 bool
7452 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7454 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7455 unsigned HOST_WIDE_INT value;
7456 return (loop_vinfo
7457 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7458 && value >= vect_min_worthwhile_factor (code));
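/* For instance, with a constant vectorization factor of 2 the bitwise
   codes qualify (minimum factor 2), while PLUS_EXPR would need a factor
   of at least 4; see vect_min_worthwhile_factor above.  */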
7461 /* Function vectorizable_induction
7463 Check if STMT_INFO performs an induction computation that can be vectorized.
7464 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7465 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7466 Return true if STMT_INFO is vectorizable in this way. */
7468 bool
7469 vectorizable_induction (stmt_vec_info stmt_info,
7470 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7471 stmt_vec_info *vec_stmt, slp_tree slp_node,
7472 stmt_vector_for_cost *cost_vec)
7474 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7475 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7476 unsigned ncopies;
7477 bool nested_in_vect_loop = false;
7478 struct loop *iv_loop;
7479 tree vec_def;
7480 edge pe = loop_preheader_edge (loop);
7481 basic_block new_bb;
7482 tree new_vec, vec_init, vec_step, t;
7483 tree new_name;
7484 gimple *new_stmt;
7485 gphi *induction_phi;
7486 tree induc_def, vec_dest;
7487 tree init_expr, step_expr;
7488 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7489 unsigned i;
7490 tree expr;
7491 gimple_seq stmts;
7492 imm_use_iterator imm_iter;
7493 use_operand_p use_p;
7494 gimple *exit_phi;
7495 edge latch_e;
7496 tree loop_arg;
7497 gimple_stmt_iterator si;
7499 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7500 if (!phi)
7501 return false;
7503 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7504 return false;
7506 /* Make sure it was recognized as induction computation. */
7507 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7508 return false;
7510 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7511 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7513 if (slp_node)
7514 ncopies = 1;
7515 else
7516 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7517 gcc_assert (ncopies >= 1);
7519 /* FORNOW. These restrictions should be relaxed. */
7520 if (nested_in_vect_loop_p (loop, stmt_info))
7522 imm_use_iterator imm_iter;
7523 use_operand_p use_p;
7524 gimple *exit_phi;
7525 edge latch_e;
7526 tree loop_arg;
7528 if (ncopies > 1)
7530 if (dump_enabled_p ())
7531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7532 "multiple types in nested loop.\n");
7533 return false;
7536 /* FORNOW: outer loop induction with SLP not supported. */
7537 if (STMT_SLP_TYPE (stmt_info))
7538 return false;
7540 exit_phi = NULL;
7541 latch_e = loop_latch_edge (loop->inner);
7542 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7543 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7545 gimple *use_stmt = USE_STMT (use_p);
7546 if (is_gimple_debug (use_stmt))
7547 continue;
7549 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7551 exit_phi = use_stmt;
7552 break;
7555 if (exit_phi)
7557 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7558 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7559 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7561 if (dump_enabled_p ())
7562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7563 "inner-loop induction only used outside "
7564 "of the outer vectorized loop.\n");
7565 return false;
7569 nested_in_vect_loop = true;
7570 iv_loop = loop->inner;
7572 else
7573 iv_loop = loop;
7574 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7576 if (slp_node && !nunits.is_constant ())
7578 /* The current SLP code creates the initial value element-by-element. */
7579 if (dump_enabled_p ())
7580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7581 "SLP induction not supported for variable-length"
7582 " vectors.\n");
7583 return false;
7586 if (!vec_stmt) /* transformation not required. */
7588 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7589 DUMP_VECT_SCOPE ("vectorizable_induction");
7590 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7591 return true;
7594 /* Transform. */
7596 /* Compute a vector variable, initialized with the first VF values of
7597 the induction variable. E.g., for an iv with IV_PHI='X' and
7598 evolution S, for a vector of 4 units, we want to compute:
7599 [X, X + S, X + 2*S, X + 3*S]. */
7601 if (dump_enabled_p ())
7602 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7604 latch_e = loop_latch_edge (iv_loop);
7605 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7607 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7608 gcc_assert (step_expr != NULL_TREE);
7610 pe = loop_preheader_edge (iv_loop);
7611 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7612 loop_preheader_edge (iv_loop));
7614 stmts = NULL;
7615 if (!nested_in_vect_loop)
7617 /* Convert the initial value to the desired type. */
7618 tree new_type = TREE_TYPE (vectype);
7619 init_expr = gimple_convert (&stmts, new_type, init_expr);
7621 /* If we are using the loop mask to "peel" for alignment then we need
7622 to adjust the start value here. */
7623 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7624 if (skip_niters != NULL_TREE)
7626 if (FLOAT_TYPE_P (vectype))
7627 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7628 skip_niters);
7629 else
7630 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7631 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7632 skip_niters, step_expr);
7633 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7634 init_expr, skip_step);
7638 /* Convert the step to the desired type. */
7639 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7641 if (stmts)
7643 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7644 gcc_assert (!new_bb);
7647 /* Find the first insertion point in the BB. */
7648 basic_block bb = gimple_bb (phi);
7649 si = gsi_after_labels (bb);
7651 /* For SLP induction we have to generate several IVs as for example
7652 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7653 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7654 [VF*S, VF*S, VF*S, VF*S] for all. */
7655 if (slp_node)
7657 /* Enforced above. */
7658 unsigned int const_nunits = nunits.to_constant ();
7660 /* Generate [VF*S, VF*S, ... ]. */
7661 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7663 expr = build_int_cst (integer_type_node, vf);
7664 expr = fold_convert (TREE_TYPE (step_expr), expr);
7666 else
7667 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7668 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7669 expr, step_expr);
7670 if (! CONSTANT_CLASS_P (new_name))
7671 new_name = vect_init_vector (stmt_info, new_name,
7672 TREE_TYPE (step_expr), NULL);
7673 new_vec = build_vector_from_val (vectype, new_name);
7674 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7676 /* Now generate the IVs. */
7677 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7678 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7679 unsigned elts = const_nunits * nvects;
7680 unsigned nivs = least_common_multiple (group_size,
7681 const_nunits) / const_nunits;
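/* E.g. for the group-size-3, 4-lane example in the comment above,
   nivs = least_common_multiple (3, 4) / 4 = 3, matching the three
   initial IV vectors shown there (illustrative numbers).  */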
7682 gcc_assert (elts % group_size == 0);
7683 tree elt = init_expr;
7684 unsigned ivn;
7685 for (ivn = 0; ivn < nivs; ++ivn)
7687 tree_vector_builder elts (vectype, const_nunits, 1);
7688 stmts = NULL;
7689 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7691 if (ivn*const_nunits + eltn >= group_size
7692 && (ivn * const_nunits + eltn) % group_size == 0)
7693 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7694 elt, step_expr);
7695 elts.quick_push (elt);
7697 vec_init = gimple_build_vector (&stmts, &elts);
7698 if (stmts)
7700 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7701 gcc_assert (!new_bb);
7704 /* Create the induction-phi that defines the induction-operand. */
7705 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7706 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7707 stmt_vec_info induction_phi_info
7708 = loop_vinfo->add_stmt (induction_phi);
7709 induc_def = PHI_RESULT (induction_phi);
7711 /* Create the iv update inside the loop */
7712 vec_def = make_ssa_name (vec_dest);
7713 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7714 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7715 loop_vinfo->add_stmt (new_stmt);
7717 /* Set the arguments of the phi node: */
7718 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7719 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7720 UNKNOWN_LOCATION);
7722 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7725 /* Re-use IVs when we can. */
7726 if (ivn < nvects)
7728 unsigned vfp
7729 = least_common_multiple (group_size, const_nunits) / group_size;
7730 /* Generate [VF'*S, VF'*S, ... ]. */
7731 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7733 expr = build_int_cst (integer_type_node, vfp);
7734 expr = fold_convert (TREE_TYPE (step_expr), expr);
7736 else
7737 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7738 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7739 expr, step_expr);
7740 if (! CONSTANT_CLASS_P (new_name))
7741 new_name = vect_init_vector (stmt_info, new_name,
7742 TREE_TYPE (step_expr), NULL);
7743 new_vec = build_vector_from_val (vectype, new_name);
7744 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7745 for (; ivn < nvects; ++ivn)
7747 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7748 tree def;
7749 if (gimple_code (iv) == GIMPLE_PHI)
7750 def = gimple_phi_result (iv);
7751 else
7752 def = gimple_assign_lhs (iv);
7753 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7754 PLUS_EXPR,
7755 def, vec_step);
7756 if (gimple_code (iv) == GIMPLE_PHI)
7757 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7758 else
7760 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7761 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7763 SLP_TREE_VEC_STMTS (slp_node).quick_push
7764 (loop_vinfo->add_stmt (new_stmt));
7768 return true;
7771 /* Create the vector that holds the initial_value of the induction. */
7772 if (nested_in_vect_loop)
7774 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7775 been created during vectorization of previous stmts. We obtain it
7776 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7777 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7778 /* If the initial value is not of proper type, convert it. */
7779 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7781 new_stmt
7782 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7783 vect_simple_var,
7784 "vec_iv_"),
7785 VIEW_CONVERT_EXPR,
7786 build1 (VIEW_CONVERT_EXPR, vectype,
7787 vec_init));
7788 vec_init = gimple_assign_lhs (new_stmt);
7789 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7790 new_stmt);
7791 gcc_assert (!new_bb);
7792 loop_vinfo->add_stmt (new_stmt);
7795 else
7797 /* iv_loop is the loop to be vectorized. Create:
7798 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7799 stmts = NULL;
7800 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7802 unsigned HOST_WIDE_INT const_nunits;
7803 if (nunits.is_constant (&const_nunits))
7805 tree_vector_builder elts (vectype, const_nunits, 1);
7806 elts.quick_push (new_name);
7807 for (i = 1; i < const_nunits; i++)
7809 /* Create: new_name_i = new_name + step_expr */
7810 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7811 new_name, step_expr);
7812 elts.quick_push (new_name);
7814 /* Create a vector from [new_name_0, new_name_1, ...,
7815 new_name_nunits-1] */
7816 vec_init = gimple_build_vector (&stmts, &elts);
7818 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7819 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7820 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7821 new_name, step_expr);
7822 else
7824 /* Build:
7825 [base, base, base, ...]
7826 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7827 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7828 gcc_assert (flag_associative_math);
7829 tree index = build_index_vector (vectype, 0, 1);
7830 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7831 new_name);
7832 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7833 step_expr);
7834 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7835 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7836 vec_init, step_vec);
7837 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7838 vec_init, base_vec);
7841 if (stmts)
7843 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7844 gcc_assert (!new_bb);
7849 /* Create the vector that holds the step of the induction. */
7850 if (nested_in_vect_loop)
7851 /* iv_loop is nested in the loop to be vectorized. Generate:
7852 vec_step = [S, S, S, S] */
7853 new_name = step_expr;
7854 else
7856 /* iv_loop is the loop to be vectorized. Generate:
7857 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7858 gimple_seq seq = NULL;
7859 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7861 expr = build_int_cst (integer_type_node, vf);
7862 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7864 else
7865 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7866 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7867 expr, step_expr);
7868 if (seq)
7870 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7871 gcc_assert (!new_bb);
7875 t = unshare_expr (new_name);
7876 gcc_assert (CONSTANT_CLASS_P (new_name)
7877 || TREE_CODE (new_name) == SSA_NAME);
7878 new_vec = build_vector_from_val (vectype, t);
7879 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7882 /* Create the following def-use cycle:
7883 loop prolog:
7884 vec_init = ...
7885 vec_step = ...
7886 loop:
7887 vec_iv = PHI <vec_init, vec_loop>
7889 STMT
7891 vec_loop = vec_iv + vec_step; */
7893 /* Create the induction-phi that defines the induction-operand. */
7894 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7895 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7896 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7897 induc_def = PHI_RESULT (induction_phi);
7899 /* Create the iv update inside the loop */
7900 vec_def = make_ssa_name (vec_dest);
7901 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7902 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7903 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7905 /* Set the arguments of the phi node: */
7906 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7907 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7908 UNKNOWN_LOCATION);
7910 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7912 /* In case the vectorization factor (VF) is bigger than the number
7913 of elements that we can fit in a vectype (nunits), we have to generate
7914 more than one vector stmt - i.e., we need to "unroll" the
7915 vector stmt by a factor VF/nunits. For more details see documentation
7916 in vectorizable_operation. */
7918 if (ncopies > 1)
7920 gimple_seq seq = NULL;
7921 stmt_vec_info prev_stmt_vinfo;
7922 /* FORNOW. This restriction should be relaxed. */
7923 gcc_assert (!nested_in_vect_loop);
7925 /* Create the vector that holds the step of the induction. */
7926 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7928 expr = build_int_cst (integer_type_node, nunits);
7929 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7931 else
7932 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7933 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7934 expr, step_expr);
7935 if (seq)
7937 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7938 gcc_assert (!new_bb);
7941 t = unshare_expr (new_name);
7942 gcc_assert (CONSTANT_CLASS_P (new_name)
7943 || TREE_CODE (new_name) == SSA_NAME);
7944 new_vec = build_vector_from_val (vectype, t);
7945 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7947 vec_def = induc_def;
7948 prev_stmt_vinfo = induction_phi_info;
7949 for (i = 1; i < ncopies; i++)
7951 /* vec_i = vec_prev + vec_step */
7952 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7953 vec_def, vec_step);
7954 vec_def = make_ssa_name (vec_dest, new_stmt);
7955 gimple_assign_set_lhs (new_stmt, vec_def);
7957 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7958 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7959 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7960 prev_stmt_vinfo = new_stmt_info;
7964 if (nested_in_vect_loop)
7966 /* Find the loop-closed exit-phi of the induction, and record
7967 the final vector of induction results: */
7968 exit_phi = NULL;
7969 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7971 gimple *use_stmt = USE_STMT (use_p);
7972 if (is_gimple_debug (use_stmt))
7973 continue;
7975 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7977 exit_phi = use_stmt;
7978 break;
7981 if (exit_phi)
7983 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7984 /* FORNOW. Currently not supporting the case that an inner-loop induction
7985 is not used in the outer-loop (i.e. only outside the outer-loop). */
7986 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7987 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7989 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7990 if (dump_enabled_p ())
7991 dump_printf_loc (MSG_NOTE, vect_location,
7992 "vector of inductions after inner-loop:%G",
7993 new_stmt);
7998 if (dump_enabled_p ())
7999 dump_printf_loc (MSG_NOTE, vect_location,
8000 "transform induction: created def-use cycle: %G%G",
8001 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8003 return true;
8006 /* Function vectorizable_live_operation.
8008 STMT_INFO computes a value that is used outside the loop. Check if
8009 it can be supported. */
8011 bool
8012 vectorizable_live_operation (stmt_vec_info stmt_info,
8013 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8014 slp_tree slp_node, int slp_index,
8015 stmt_vec_info *vec_stmt,
8016 stmt_vector_for_cost *)
8018 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8020 imm_use_iterator imm_iter;
8021 tree lhs, lhs_type, bitsize, vec_bitsize;
8022 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8023 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8024 int ncopies;
8025 gimple *use_stmt;
8026 auto_vec<tree> vec_oprnds;
8027 int vec_entry = 0;
8028 poly_uint64 vec_index = 0;
8030 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8032 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8033 return false;
8035 /* FORNOW. CHECKME. */
8036 if (nested_in_vect_loop_p (loop, stmt_info))
8037 return false;
8039 /* If STMT is not relevant and it is a simple assignment and its inputs are
8040 invariant then it can remain in place, unvectorized. The original last
8041 scalar value that it computes will be used. */
8042 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8044 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8045 if (dump_enabled_p ())
8046 dump_printf_loc (MSG_NOTE, vect_location,
8047 "statement is simple and uses invariant. Leaving in "
8048 "place.\n");
8049 return true;
8052 if (slp_node)
8053 ncopies = 1;
8054 else
8055 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8057 if (slp_node)
8059 gcc_assert (slp_index >= 0);
8061 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8062 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8064 /* Get the last occurrence of the scalar index from the concatenation of
8065 all the slp vectors. Calculate which slp vector it is and the index
8066 within. */
8067 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
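/* For example (illustrative numbers): with num_vec = 2 vectors of
   nunits = 4 lanes holding num_scalar = 6 scalar results, slp_index 5
   gives pos = 2 * 4 - 6 + 5 = 7, i.e. vector 1, lane 3.  */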
8069 /* Calculate which vector contains the result, and which lane of
8070 that vector we need. */
8071 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8073 if (dump_enabled_p ())
8074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8075 "Cannot determine which vector holds the"
8076 " final result.\n");
8077 return false;
8081 if (!vec_stmt)
8083 /* No transformation required. */
8084 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8086 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8087 OPTIMIZE_FOR_SPEED))
8089 if (dump_enabled_p ())
8090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8091 "can't use a fully-masked loop because "
8092 "the target doesn't support extract last "
8093 "reduction.\n");
8094 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8096 else if (slp_node)
8098 if (dump_enabled_p ())
8099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8100 "can't use a fully-masked loop because an "
8101 "SLP statement is live after the loop.\n");
8102 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8104 else if (ncopies > 1)
8106 if (dump_enabled_p ())
8107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8108 "can't use a fully-masked loop because"
8109 " ncopies is greater than 1.\n");
8110 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8112 else
8114 gcc_assert (ncopies == 1 && !slp_node);
8115 vect_record_loop_mask (loop_vinfo,
8116 &LOOP_VINFO_MASKS (loop_vinfo),
8117 1, vectype);
8120 return true;
8123 /* Use the lhs of the original scalar statement. */
8124 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8126 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8127 : gimple_get_lhs (stmt);
8128 lhs_type = TREE_TYPE (lhs);
8130 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8131 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8132 : TYPE_SIZE (TREE_TYPE (vectype)));
8133 vec_bitsize = TYPE_SIZE (vectype);
8135 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8136 tree vec_lhs, bitstart;
8137 if (slp_node)
8139 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8141 /* Get the correct slp vectorized stmt. */
8142 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8143 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8144 vec_lhs = gimple_phi_result (phi);
8145 else
8146 vec_lhs = gimple_get_lhs (vec_stmt);
8148 /* Get entry to use. */
8149 bitstart = bitsize_int (vec_index);
8150 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8152 else
8154 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8155 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8156 gcc_checking_assert (ncopies == 1
8157 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8159 /* For multiple copies, get the last copy. */
8160 for (int i = 1; i < ncopies; ++i)
8161 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8163 /* Get the last lane in the vector. */
8164 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
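/* E.g. for a V4SI result, vec_bitsize is 128 and bitsize is 32, so
   bitstart is 96, the first bit of the last lane (illustrative type).  */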
8167 gimple_seq stmts = NULL;
8168 tree new_tree;
8169 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8171 /* Emit:
8173 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8175 where VEC_LHS is the vectorized live-out result and MASK is
8176 the loop mask for the final iteration. */
8177 gcc_assert (ncopies == 1 && !slp_node);
8178 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8179 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8180 1, vectype, 0);
8181 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8182 scalar_type, mask, vec_lhs);
8184 /* Convert the extracted vector element to the required scalar type. */
8185 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8187 else
8189 tree bftype = TREE_TYPE (vectype);
8190 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8191 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8192 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8193 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8194 &stmts, true, NULL_TREE);
8197 if (stmts)
8198 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8200 /* Replace use of lhs with newly computed result. If the use stmt is a
8201 single arg PHI, just replace all uses of PHI result. It's necessary
8202 because lcssa PHI defining lhs may be before newly inserted stmt. */
8203 use_operand_p use_p;
8204 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8205 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8206 && !is_gimple_debug (use_stmt))
8208 if (gimple_code (use_stmt) == GIMPLE_PHI
8209 && gimple_phi_num_args (use_stmt) == 1)
8211 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8213 else
8215 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8216 SET_USE (use_p, new_tree);
8218 update_stmt (use_stmt);
8221 return true;
8224 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8226 static void
8227 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8229 ssa_op_iter op_iter;
8230 imm_use_iterator imm_iter;
8231 def_operand_p def_p;
8232 gimple *ustmt;
8234 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8236 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8238 basic_block bb;
8240 if (!is_gimple_debug (ustmt))
8241 continue;
8243 bb = gimple_bb (ustmt);
8245 if (!flow_bb_inside_loop_p (loop, bb))
8247 if (gimple_debug_bind_p (ustmt))
8249 if (dump_enabled_p ())
8250 dump_printf_loc (MSG_NOTE, vect_location,
8251 "killing debug use\n");
8253 gimple_debug_bind_reset_value (ustmt);
8254 update_stmt (ustmt);
8256 else
8257 gcc_unreachable ();
8263 /* Given loop represented by LOOP_VINFO, return true if computation of
8264 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8265 otherwise. */
8267 static bool
8268 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8270 /* Constant case. */
8271 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8273 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8274 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8276 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8277 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8278 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8279 return true;
8282 widest_int max;
8283 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8284 /* Check the upper bound of loop niters. */
8285 if (get_max_loop_iterations (loop, &max))
8287 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8288 signop sgn = TYPE_SIGN (type);
8289 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8290 if (max < type_max)
8291 return true;
8293 return false;
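/* E.g. if NITERS has a 32-bit unsigned type, a proven bound on the
   latch count that is strictly below 0xffffffff means NITERSM1 + 1
   cannot wrap, so the function returns true (an illustrative case).  */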
8296 /* Return a mask type with half the number of elements as TYPE. */
8298 tree
8299 vect_halve_mask_nunits (tree type)
8301 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8302 return build_truth_vector_type (nunits, current_vector_size);
8305 /* Return a mask type with twice as many elements as TYPE. */
8307 tree
8308 vect_double_mask_nunits (tree type)
8310 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8311 return build_truth_vector_type (nunits, current_vector_size);
8314 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8315 contain a sequence of NVECTORS masks that each control a vector of type
8316 VECTYPE. */
8318 void
8319 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8320 unsigned int nvectors, tree vectype)
8322 gcc_assert (nvectors != 0);
8323 if (masks->length () < nvectors)
8324 masks->safe_grow_cleared (nvectors);
8325 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8326 /* The number of scalars per iteration and the number of vectors are
8327 both compile-time constants. */
8328 unsigned int nscalars_per_iter
8329 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8330 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
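/* Illustrative numbers: with a vectorization factor of 16 and two
   8-element vectors per iteration, this records rgroup 1 (nvectors - 1)
   with nscalars_per_iter = 2 * 8 / 16 = 1.  */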
8331 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8333 rgm->max_nscalars_per_iter = nscalars_per_iter;
8334 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8338 /* Given a complete set of masks MASKS, extract mask number INDEX
8339 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8340 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8342 See the comment above vec_loop_masks for more details about the mask
8343 arrangement. */
8345 tree
8346 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8347 unsigned int nvectors, tree vectype, unsigned int index)
8349 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8350 tree mask_type = rgm->mask_type;
8352 /* Populate the rgroup's mask array, if this is the first time we've
8353 used it. */
8354 if (rgm->masks.is_empty ())
8356 rgm->masks.safe_grow_cleared (nvectors);
8357 for (unsigned int i = 0; i < nvectors; ++i)
8359 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8360 /* Provide a dummy definition until the real one is available. */
8361 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8362 rgm->masks[i] = mask;
8366 tree mask = rgm->masks[index];
8367 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8368 TYPE_VECTOR_SUBPARTS (vectype)))
8370 /* A loop mask for data type X can be reused for data type Y
8371 if X has N times more elements than Y and if Y's elements
8372 are N times bigger than X's. In this case each sequence
8373 of N elements in the loop mask will be all-zero or all-one.
8374 We can then view-convert the mask so that each sequence of
8375 N elements is replaced by a single element. */
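/* E.g. (illustrative types) a mask created for 8-element short vectors
   can be reused for 4-element int vectors: each adjacent pair of mask
   elements is known to be all-zero or all-one and view-converts to a
   single element of the wider mask.  */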
8376 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8377 TYPE_VECTOR_SUBPARTS (vectype)));
8378 gimple_seq seq = NULL;
8379 mask_type = build_same_sized_truth_vector_type (vectype);
8380 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8381 if (seq)
8382 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8384 return mask;
8387 /* Scale profiling counters by estimation for LOOP which is vectorized
8388 by factor VF. */
8390 static void
8391 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8393 edge preheader = loop_preheader_edge (loop);
8394 /* Reduce loop iterations by the vectorization factor. */
8395 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8396 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8398 if (freq_h.nonzero_p ())
8400 profile_probability p;
8402 /* Avoid dropping loop body profile counter to 0 because of zero count
8403 in loop's preheader. */
8404 if (!(freq_e == profile_count::zero ()))
8405 freq_e = freq_e.force_nonzero ();
8406 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8407 scale_loop_frequencies (loop, p);
8410 edge exit_e = single_exit (loop);
8411 exit_e->probability = profile_probability::always ()
8412 .apply_scale (1, new_est_niter + 1);
8414 edge exit_l = single_pred_edge (loop->latch);
8415 profile_probability prob = exit_l->probability;
8416 exit_l->probability = exit_e->probability.invert ();
8417 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8418 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8421 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8422 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8423 stmt_vec_info. */
8425 static void
8426 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8427 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8429 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8430 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8432 if (dump_enabled_p ())
8433 dump_printf_loc (MSG_NOTE, vect_location,
8434 "------>vectorizing statement: %G", stmt_info->stmt);
8436 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8437 vect_loop_kill_debug_uses (loop, stmt_info);
8439 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8440 && !STMT_VINFO_LIVE_P (stmt_info))
8441 return;
8443 if (STMT_VINFO_VECTYPE (stmt_info))
8445 poly_uint64 nunits
8446 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8447 if (!STMT_SLP_TYPE (stmt_info)
8448 && maybe_ne (nunits, vf)
8449 && dump_enabled_p ())
8450 /* For SLP, VF is set according to the unrolling factor, and not
8451 to the vector size, hence for SLP this print is not valid. */
8452 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8455 /* Pure SLP statements have already been vectorized. We still need
8456 to apply loop vectorization to hybrid SLP statements. */
8457 if (PURE_SLP_STMT (stmt_info))
8458 return;
8460 if (dump_enabled_p ())
8461 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8463 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8464 *seen_store = stmt_info;
8467 /* Function vect_transform_loop.
8469 The analysis phase has determined that the loop is vectorizable.
8470 Vectorize the loop - created vectorized stmts to replace the scalar
8471 stmts in the loop, and update the loop exit condition.
8472 Returns scalar epilogue loop if any. */
8474 struct loop *
8475 vect_transform_loop (loop_vec_info loop_vinfo)
8477 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8478 struct loop *epilogue = NULL;
8479 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8480 int nbbs = loop->num_nodes;
8481 int i;
8482 tree niters_vector = NULL_TREE;
8483 tree step_vector = NULL_TREE;
8484 tree niters_vector_mult_vf = NULL_TREE;
8485 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8486 unsigned int lowest_vf = constant_lower_bound (vf);
8487 gimple *stmt;
8488 bool check_profitability = false;
8489 unsigned int th;
8491 DUMP_VECT_SCOPE ("vec_transform_loop");
8493 loop_vinfo->shared->check_datarefs ();
8495 /* Use the more conservative vectorization threshold. If the number
8496 of iterations is constant assume the cost check has been performed
8497 by our caller. If the threshold makes all loops profitable that
8498 run at least the (estimated) vectorization factor number of times
8499 checking is pointless, too. */
8500 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8501 if (th >= vect_vf_for_cost (loop_vinfo)
8502 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8504 if (dump_enabled_p ())
8505 dump_printf_loc (MSG_NOTE, vect_location,
8506 "Profitability threshold is %d loop iterations.\n",
8507 th);
8508 check_profitability = true;
8511 /* Make sure there exists a single-predecessor exit bb. Do this before
8512 versioning. */
8513 edge e = single_exit (loop);
8514 if (! single_pred_p (e->dest))
8516 split_loop_exit_edge (e, true);
8517 if (dump_enabled_p ())
8518 dump_printf (MSG_NOTE, "split exit edge\n");
8521 /* Version the loop first, if required, so the profitability check
8522 comes first. */
8524 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8526 poly_uint64 versioning_threshold
8527 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8528 if (check_profitability
8529 && ordered_p (poly_uint64 (th), versioning_threshold))
8531 versioning_threshold = ordered_max (poly_uint64 (th),
8532 versioning_threshold);
8533 check_profitability = false;
8535 struct loop *sloop
8536 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8537 versioning_threshold);
8538 sloop->force_vectorize = false;
8539 check_profitability = false;
8542 /* Make sure there exists a single-predecessor exit bb also on the
8543 scalar loop copy. Do this after versioning but before peeling
8544 so CFG structure is fine for both scalar and if-converted loop
8545 to make slpeel_duplicate_current_defs_from_edges face matched
8546 loop closed PHI nodes on the exit. */
8547 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8549 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8550 if (! single_pred_p (e->dest))
8552 split_loop_exit_edge (e, true);
8553 if (dump_enabled_p ())
8554 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8558 tree niters = vect_build_loop_niters (loop_vinfo);
8559 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8560 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8561 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8562 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8563 &step_vector, &niters_vector_mult_vf, th,
8564 check_profitability, niters_no_overflow);
8566 if (niters_vector == NULL_TREE)
8568 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8569 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8570 && known_eq (lowest_vf, vf))
8572 niters_vector
8573 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8574 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8575 step_vector = build_one_cst (TREE_TYPE (niters));
8577 else
8578 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8579 &step_vector, niters_no_overflow);
8582 /* 1) Make sure the loop header has exactly two entries
8583 2) Make sure we have a preheader basic block. */
8585 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8587 split_edge (loop_preheader_edge (loop));
8589 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8590 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8591 /* This will deal with any possible peeling. */
8592 vect_prepare_for_masked_peels (loop_vinfo);
8594 /* Schedule the SLP instances first, then handle loop vectorization
8595 below. */
8596 if (!loop_vinfo->slp_instances.is_empty ())
8598 DUMP_VECT_SCOPE ("scheduling SLP instances");
8599 vect_schedule_slp (loop_vinfo);
8602 /* FORNOW: the vectorizer supports only loops whose body consists
8603 of one basic block (header + empty latch). When the vectorizer
8604 supports more involved loop forms, the order in which the BBs are
8605 traversed will need to be reconsidered. */
8607 for (i = 0; i < nbbs; i++)
8609 basic_block bb = bbs[i];
8610 stmt_vec_info stmt_info;
8612 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8613 gsi_next (&si))
8615 gphi *phi = si.phi ();
8616 if (dump_enabled_p ())
8617 dump_printf_loc (MSG_NOTE, vect_location,
8618 "------>vectorizing phi: %G", phi);
8619 stmt_info = loop_vinfo->lookup_stmt (phi);
8620 if (!stmt_info)
8621 continue;
8623 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8624 vect_loop_kill_debug_uses (loop, stmt_info);
8626 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8627 && !STMT_VINFO_LIVE_P (stmt_info))
8628 continue;
8630 if (STMT_VINFO_VECTYPE (stmt_info)
8631 && (maybe_ne
8632 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8633 && dump_enabled_p ())
8634 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8636 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8637 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8638 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8639 && ! PURE_SLP_STMT (stmt_info))
8641 if (dump_enabled_p ())
8642 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8643 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8647 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8648 !gsi_end_p (si);)
8650 stmt = gsi_stmt (si);
8651 /* During vectorization remove existing clobber stmts. */
8652 if (gimple_clobber_p (stmt))
8654 unlink_stmt_vdef (stmt);
8655 gsi_remove (&si, true);
8656 release_defs (stmt);
8658 else
8660 stmt_info = loop_vinfo->lookup_stmt (stmt);
8662 /* vector stmts created in the outer-loop during vectorization of
8663 stmts in an inner-loop may not have a stmt_info, and do not
8664 need to be vectorized. */
8665 stmt_vec_info seen_store = NULL;
8666 if (stmt_info)
8668 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8670 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8671 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8672 !gsi_end_p (subsi); gsi_next (&subsi))
8674 stmt_vec_info pat_stmt_info
8675 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8676 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8677 &si, &seen_store);
8679 stmt_vec_info pat_stmt_info
8680 = STMT_VINFO_RELATED_STMT (stmt_info);
8681 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8682 &seen_store);
8684 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8685 &seen_store);
8687 gsi_next (&si);
8688 if (seen_store)
8690 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8691 /* Interleaving. The vectorization of the
8692 interleaving chain has been completed;
8693 free all the stores in the chain. */
8694 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8695 else
8696 /* Free the attached stmt_vec_info and remove the stmt. */
8697 loop_vinfo->remove_stmt (stmt_info);
8702 /* Stub out scalar statements that must not survive vectorization.
8703 Doing this here helps with grouped statements, or statements that
8704 are involved in patterns. */
8705 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8706 !gsi_end_p (gsi); gsi_next (&gsi))
8708 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8709 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8711 tree lhs = gimple_get_lhs (call);
8712 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8714 tree zero = build_zero_cst (TREE_TYPE (lhs));
8715 gimple *new_stmt = gimple_build_assign (lhs, zero);
8716 gsi_replace (&gsi, new_stmt, true);
8720 } /* BBs in loop */
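/* A made-up sketch of the stubbing above: a scalar .MASK_LOAD whose result
   is no longer needed after vectorization, e.g.

       _5 = .MASK_LOAD (p_3, 32B, mask_4);

   is replaced by a plain zero assignment so that no scalar (potentially
   faulting) masked access survives:

       _5 = 0;  */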
8722 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8723 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8724 if (integer_onep (step_vector))
8725 niters_no_overflow = true;
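/* For example (purely illustrative): if a 32-bit NITERS wrapped to 0
   because the loop actually runs 2^32 times, then with VF == 8 the value
   of NITERS_VECTOR is 2^32 / 8 == 2^29, which is nonzero and representable,
   so an IV stepping by 1 up to NITERS_VECTOR cannot overflow.  */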
8726 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8727 niters_vector_mult_vf, !niters_no_overflow);
8729 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8730 scale_profile_for_vect_loop (loop, assumed_vf);
8732 /* True if the final iteration might not handle a full vector's
8733 worth of scalar iterations. */
8734 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8735 /* The minimum number of iterations performed by the epilogue. This
8736 is 1 when peeling for gaps because we always need a final scalar
8737 iteration. */
8738 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8739 /* +1 to convert latch counts to loop iteration counts,
8740 -min_epilogue_iters to remove iterations that cannot be performed
8741 by the vector code. */
8742 int bias_for_lowest = 1 - min_epilogue_iters;
8743 int bias_for_assumed = bias_for_lowest;
8744 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8745 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8747 /* When the amount of peeling is known at compile time, the first
8748 iteration will have exactly alignment_npeels active elements.
8749 In the worst case it will have at least one. */
8750 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8751 bias_for_lowest += lowest_vf - min_first_active;
8752 bias_for_assumed += assumed_vf - min_first_active;
8754 /* In these calculations the "- 1" converts loop iteration counts
8755 back to latch counts. */
8756 if (loop->any_upper_bound)
8757 loop->nb_iterations_upper_bound
8758 = (final_iter_may_be_partial
8759 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8760 lowest_vf) - 1
8761 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8762 lowest_vf) - 1);
8763 if (loop->any_likely_upper_bound)
8764 loop->nb_iterations_likely_upper_bound
8765 = (final_iter_may_be_partial
8766 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8767 + bias_for_lowest, lowest_vf) - 1
8768 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8769 + bias_for_lowest, lowest_vf) - 1);
8770 if (loop->any_estimate)
8771 loop->nb_iterations_estimate
8772 = (final_iter_may_be_partial
8773 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8774 assumed_vf) - 1
8775 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8776 assumed_vf) - 1);
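/* A made-up example of the scaling above: with no peeling for gaps and no
   fully-masked loop, min_epilogue_iters == 0 and bias_for_lowest == 1.
   If lowest_vf == 4 and the original latch-count upper bound is 11
   (i.e. at most 12 iterations), the new bound is
   floor ((11 + 1) / 4) - 1 == 2, i.e. at most 3 vector iterations.  */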
8778 if (dump_enabled_p ())
8780 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8782 dump_printf_loc (MSG_NOTE, vect_location,
8783 "LOOP VECTORIZED\n");
8784 if (loop->inner)
8785 dump_printf_loc (MSG_NOTE, vect_location,
8786 "OUTER LOOP VECTORIZED\n");
8787 dump_printf (MSG_NOTE, "\n");
8789 else
8791 dump_printf_loc (MSG_NOTE, vect_location,
8792 "LOOP EPILOGUE VECTORIZED (VS=");
8793 dump_dec (MSG_NOTE, current_vector_size);
8794 dump_printf (MSG_NOTE, ")\n");
8798 /* Loops vectorized with a variable factor won't benefit from
8799 unrolling/peeling. */
8800 if (!vf.is_constant ())
8802 loop->unroll = 1;
8803 if (dump_enabled_p ())
8804 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8805 " variable-length vectorization factor\n");
8807 /* Free SLP instances here because otherwise stmt reference counting
8808 won't work. */
8809 slp_instance instance;
8810 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8811 vect_free_slp_instance (instance, true);
8812 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8813 /* Clear the safelen field since its value is invalid after vectorization,
8814 as the vectorized loop can have loop-carried dependencies. */
8815 loop->safelen = 0;
8817 /* Don't vectorize the epilogue of an epilogue loop. */
8818 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8819 epilogue = NULL;
8821 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8822 epilogue = NULL;
8824 if (epilogue)
8826 auto_vector_sizes vector_sizes;
8827 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8828 unsigned int next_size = 0;
8830 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8831 on niters already adjusted for the iterations of the prologue. */
8832 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8833 && known_eq (vf, lowest_vf))
8835 unsigned HOST_WIDE_INT eiters
8836 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8837 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8838 eiters
8839 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8840 epilogue->nb_iterations_upper_bound = eiters - 1;
8841 epilogue->any_upper_bound = true;
8843 unsigned int ratio;
8844 while (next_size < vector_sizes.length ()
8845 && !(constant_multiple_p (current_vector_size,
8846 vector_sizes[next_size], &ratio)
8847 && eiters >= lowest_vf / ratio))
8848 next_size += 1;
8850 else
8851 while (next_size < vector_sizes.length ()
8852 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8853 next_size += 1;
8855 if (next_size == vector_sizes.length ())
8856 epilogue = NULL;
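/* An illustrative walk-through with invented numbers: suppose NITERS is a
   known 101, lowest_vf == 16 and there is no peeling for gaps, so
   eiters == 101 % 16 == 5.  On a target offering 64-, 32- and 16-byte
   vectors with current_vector_size == 64, the 64- and 32-byte sizes are
   skipped (they would need at least 16 and 16 / 2 == 8 epilogue iterations
   respectively) and the 16-byte size qualifies because 5 >= 16 / 4.  If no
   smaller size qualifies, EPILOGUE is cleared and the epilogue stays
   scalar.  */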
8859 if (epilogue)
8861 epilogue->force_vectorize = loop->force_vectorize;
8862 epilogue->safelen = loop->safelen;
8863 epilogue->dont_vectorize = false;
8865 /* We may need to if-convert the epilogue to vectorize it. */
8866 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8867 tree_if_conversion (epilogue);
8870 return epilogue;
8873 /* The code below performs a simple optimization: it reverts
8874 if-conversion for masked stores, i.e. if the mask of a store is zero,
8875 the store is skipped and, if possible, so are the producers of the stored values.
8876 For example,
8877 for (i=0; i<n; i++)
8878 if (c[i])
8880 p1[i] += 1;
8881 p2[i] = p3[i] +2;
8883 this transformation will produce the following semi-hammock:
8885 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8887 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8888 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8889 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8890 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8891 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8892 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8896 void
8897 optimize_mask_stores (struct loop *loop)
8899 basic_block *bbs = get_loop_body (loop);
8900 unsigned nbbs = loop->num_nodes;
8901 unsigned i;
8902 basic_block bb;
8903 struct loop *bb_loop;
8904 gimple_stmt_iterator gsi;
8905 gimple *stmt;
8906 auto_vec<gimple *> worklist;
8907 auto_purge_vect_location sentinel;
8909 vect_location = find_loop_location (loop);
8910 /* Pick up all masked stores in loop if any. */
8911 for (i = 0; i < nbbs; i++)
8913 bb = bbs[i];
8914 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8915 gsi_next (&gsi))
8917 stmt = gsi_stmt (gsi);
8918 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8919 worklist.safe_push (stmt);
8923 free (bbs);
8924 if (worklist.is_empty ())
8925 return;
8927 /* Loop has masked stores. */
8928 while (!worklist.is_empty ())
8930 gimple *last, *last_store;
8931 edge e, efalse;
8932 tree mask;
8933 basic_block store_bb, join_bb;
8934 gimple_stmt_iterator gsi_to;
8935 tree vdef, new_vdef;
8936 gphi *phi;
8937 tree vectype;
8938 tree zero;
8940 last = worklist.pop ();
8941 mask = gimple_call_arg (last, 2);
8942 bb = gimple_bb (last);
8943 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8944 to the same loop as if_bb. This may differ from LOOP when a two-level
8945 loop nest is vectorized and the mask_store belongs to the inner
8946 one. */
8947 e = split_block (bb, last);
8948 bb_loop = bb->loop_father;
8949 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8950 join_bb = e->dest;
8951 store_bb = create_empty_bb (bb);
8952 add_bb_to_loop (store_bb, bb_loop);
8953 e->flags = EDGE_TRUE_VALUE;
8954 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8955 /* Put STORE_BB to likely part. */
8956 efalse->probability = profile_probability::unlikely ();
8957 store_bb->count = efalse->count ();
8958 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8959 if (dom_info_available_p (CDI_DOMINATORS))
8960 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8961 if (dump_enabled_p ())
8962 dump_printf_loc (MSG_NOTE, vect_location,
8963 "Create new block %d to sink mask stores.",
8964 store_bb->index);
8965 /* Create vector comparison with boolean result. */
8966 vectype = TREE_TYPE (mask);
8967 zero = build_zero_cst (vectype);
8968 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8969 gsi = gsi_last_bb (bb);
8970 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
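/* Illustratively (SSA names invented), the guard built above looks like

       if (mask__ifc_42_165 == { 0, ..., 0 })
         goto join_bb;   <-- EDGE_TRUE_VALUE: all lanes inactive, skip stores
       else
         goto store_bb;  <-- EDGE_FALSE_VALUE: execute the masked stores  */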
8971 /* Create new PHI node for vdef of the last masked store:
8972 .MEM_2 = VDEF <.MEM_1>
8973 will be converted to
8974 .MEM.3 = VDEF <.MEM_1>
8975 and new PHI node will be created in join bb
8976 .MEM_2 = PHI <.MEM_1, .MEM_3>
8978 vdef = gimple_vdef (last);
8979 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8980 gimple_set_vdef (last, new_vdef);
8981 phi = create_phi_node (vdef, join_bb);
8982 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8984 /* Put all masked stores with the same mask to STORE_BB if possible. */
8985 while (true)
8987 gimple_stmt_iterator gsi_from;
8988 gimple *stmt1 = NULL;
8990 /* Move masked store to STORE_BB. */
8991 last_store = last;
8992 gsi = gsi_for_stmt (last);
8993 gsi_from = gsi;
8994 /* Shift GSI to the previous stmt for further traversal. */
8995 gsi_prev (&gsi);
8996 gsi_to = gsi_start_bb (store_bb);
8997 gsi_move_before (&gsi_from, &gsi_to);
8998 /* Setup GSI_TO to the non-empty block start. */
8999 gsi_to = gsi_start_bb (store_bb);
9000 if (dump_enabled_p ())
9001 dump_printf_loc (MSG_NOTE, vect_location,
9002 "Move stmt to created bb\n%G", last);
9003 /* Move all stored value producers if possible. */
9004 while (!gsi_end_p (gsi))
9006 tree lhs;
9007 imm_use_iterator imm_iter;
9008 use_operand_p use_p;
9009 bool res;
9011 /* Skip debug statements. */
9012 if (is_gimple_debug (gsi_stmt (gsi)))
9014 gsi_prev (&gsi);
9015 continue;
9017 stmt1 = gsi_stmt (gsi);
9018 /* Do not consider statements writing to memory or having
9019 volatile operands. */
9020 if (gimple_vdef (stmt1)
9021 || gimple_has_volatile_ops (stmt1))
9022 break;
9023 gsi_from = gsi;
9024 gsi_prev (&gsi);
9025 lhs = gimple_get_lhs (stmt1);
9026 if (!lhs)
9027 break;
9029 /* LHS of vectorized stmt must be SSA_NAME. */
9030 if (TREE_CODE (lhs) != SSA_NAME)
9031 break;
9033 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9035 /* Remove dead scalar statement. */
9036 if (has_zero_uses (lhs))
9038 gsi_remove (&gsi_from, true);
9039 continue;
9043 /* Check that LHS does not have uses outside of STORE_BB. */
9044 res = true;
9045 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9047 gimple *use_stmt;
9048 use_stmt = USE_STMT (use_p);
9049 if (is_gimple_debug (use_stmt))
9050 continue;
9051 if (gimple_bb (use_stmt) != store_bb)
9053 res = false;
9054 break;
9057 if (!res)
9058 break;
9060 if (gimple_vuse (stmt1)
9061 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9062 break;
9064 /* Can move STMT1 to STORE_BB. */
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_NOTE, vect_location,
9067 "Move stmt to created bb\n%G", stmt1);
9068 gsi_move_before (&gsi_from, &gsi_to);
9069 /* Shift GSI_TO for further insertion. */
9070 gsi_prev (&gsi_to);
9072 /* Put other masked stores with the same mask to STORE_BB. */
9073 if (worklist.is_empty ()
9074 || gimple_call_arg (worklist.last (), 2) != mask
9075 || worklist.last () != stmt1)
9076 break;
9077 last = worklist.pop ();
9079 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9083 /* Decide whether it is possible to use a zero-based induction variable
9084 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9085 return the value that the induction variable must be able to hold
9086 in order to ensure that the loop ends with an all-false mask.
9087 Return -1 otherwise. */
9088 widest_int
9089 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9091 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9092 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9093 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9095 /* Calculate the value that the induction variable must be able
9096 to hit in order to ensure that we end the loop with an all-false mask.
9097 This involves adding the maximum number of inactive trailing scalar
9098 iterations. */
9099 widest_int iv_limit = -1;
9100 if (max_loop_iterations (loop, &iv_limit))
9102 if (niters_skip)
9104 /* Add the maximum number of skipped iterations to the
9105 maximum iteration count. */
9106 if (TREE_CODE (niters_skip) == INTEGER_CST)
9107 iv_limit += wi::to_widest (niters_skip);
9108 else
9109 iv_limit += max_vf - 1;
9111 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9112 /* Make a conservatively-correct assumption. */
9113 iv_limit += max_vf - 1;
9115 /* IV_LIMIT is the maximum number of latch iterations, which is also
9116 the maximum in-range IV value. Round this value down to the previous
9117 vector alignment boundary and then add an extra full iteration. */
9118 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9119 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
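/* A made-up numeric example: with a constant VF of 8, max_vf == 8 and a
   maximum latch count of 17 (and no skipped or peeled iterations added
   above), IV_LIMIT becomes (17 & -8) + 8 == 16 + 8 == 24.  */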
9121 return iv_limit;