gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
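/* The pass comment above notes that an operation is only vectorizable when
   the corresponding optab has a handler for the vector mode.  The helper
   below is a minimal illustrative sketch of that check, not part of the
   pass; it assumes the optabs query interface (optab_handler,
   CODE_FOR_nothing) is in scope and reuses the add_optab/V8HImode example
   from the comment.  */

static bool
example_v8hi_add_supported_p (void)
{
  /* CODE_FOR_nothing means the target has no insn for this operation in
     this mode, so a V8HImode addition could not be vectorized.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}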
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
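/* A worked example of the VF rule documented above, assuming for the sake
   of illustration a fixed vector size in bytes (the real code works on
   poly_uint64 nunits via vect_update_max_nunits): 16-byte vectors and
   4-byte elements give 16 / 4 = 4, so VF = 4.  This helper is a sketch
   only and is not used by the pass.  */

static unsigned int
example_vf_for_element_size (unsigned int vector_size_in_bytes,
			     unsigned int element_size_in_bytes)
{
  return vector_size_in_bytes / element_size_in_bytes;
}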
389 /* Function vect_is_simple_iv_evolution.
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
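/* Source-level illustration of the test above (a sketch, not used by the
   pass): in the first loop the induction variable advances by a constant
   step, so its evolution {0, +, 1}_loop is "simple"; in the second loop
   the step itself changes every iteration, so the evolution is a
   higher-degree polynomial and is rejected.  */

static int
example_simple_iv (int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)	/* i has evolution {0, +, 1}_loop.  */
    sum += a[i];
  return sum;
}

static int
example_non_simple_iv (int *a, int n)
{
  int sum = 0, step = 1;
  for (int i = 0; i < n; i += step++)	/* step varies: not simple.  */
    sum += a[i];
  return sum;
}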
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
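/* Source form corresponding to the PHI structure shown in the comment
   above (an illustrative sketch, not used by the pass): the inner-loop
   PHI for "s" plays the role of x_2, the outer-loop PHIs the roles of
   x_1 and x_4.  */

static int
example_double_reduction (int a[8][8])
{
  int s = 0;
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      s += a[i][j];
  return s;
}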
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
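/* Illustration of the ??? note above (a sketch, not used here): for this
   loop the latch executes UINT_MAX times, so NUMBER_OF_ITERATIONSM1 is
   UINT_MAX, and adding 1 to form the header execution count wraps to
   zero in the same unsigned type.  */

static unsigned int
example_wrapping_niter (void)
{
  unsigned int n = 0;
  do
    n++;
  while (n != 0);
  return n;
}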
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 simd_if_cond (NULL_TREE),
823 unaligned_dr (NULL),
824 peeling_for_alignment (0),
825 ptr_mask (0),
826 ivexpr_map (NULL),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 operands_swapped (false),
835 no_data_dependencies (false),
836 has_mask_store (false),
837 scalar_loop (NULL),
838 orig_loop_info (NULL)
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
847 gcc_assert (nbbs == loop->num_nodes);
849 for (unsigned int i = 0; i < nbbs; i++)
851 basic_block bb = bbs[i];
852 gimple_stmt_iterator si;
854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
858 add_stmt (phi);
861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
865 add_stmt (stmt);
866 /* If .GOMP_SIMD_LANE call for the current loop has 2 arguments, the
867 second argument is the #pragma omp simd if (x) condition, when 0,
868 loop shouldn't be vectorized, when non-zero constant, it should
869 be vectorized normally, otherwise versioned with vectorized loop
870 done if the condition is non-zero at runtime. */
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt) >= 2
876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
880 tree arg = gimple_call_arg (stmt, 1);
881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
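/* Source-level example of the IFN_GOMP_SIMD_LANE situation described in
   the comment above (a sketch; "cond" is a hypothetical variable and the
   code assumes compilation with -fopenmp or -fopenmp-simd): a
   non-constant if() clause leads to the runtime-versioned case.  */

static void
example_simd_if (int *a, int n, int cond)
{
  #pragma omp simd if (cond)
  for (int i = 0; i < n; i++)
    a[i] += 1;
}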
890 /* Free all levels of MASKS. */
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
905 _loop_vec_info::~_loop_vec_info ()
907 int nbbs;
908 gimple_stmt_iterator si;
909 int j;
911 nbbs = loop->num_nodes;
912 for (j = 0; j < nbbs; j++)
914 basic_block bb = bbs[j];
915 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
917 gimple *stmt = gsi_stmt (si);
919 /* We may have broken canonical form by moving a constant
920 into RHS1 of a commutative op. Fix such occurrences. */
921 if (operands_swapped && is_gimple_assign (stmt))
923 enum tree_code code = gimple_assign_rhs_code (stmt);
925 if ((code == PLUS_EXPR
926 || code == POINTER_PLUS_EXPR
927 || code == MULT_EXPR)
928 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
929 swap_ssa_operands (stmt,
930 gimple_assign_rhs1_ptr (stmt),
931 gimple_assign_rhs2_ptr (stmt));
932 else if (code == COND_EXPR
933 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
935 tree cond_expr = gimple_assign_rhs1 (stmt);
936 enum tree_code cond_code = TREE_CODE (cond_expr);
938 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
940 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
941 0));
942 cond_code = invert_tree_comparison (cond_code,
943 honor_nans);
944 if (cond_code != ERROR_MARK)
946 TREE_SET_CODE (cond_expr, cond_code);
947 swap_ssa_operands (stmt,
948 gimple_assign_rhs2_ptr (stmt),
949 gimple_assign_rhs3_ptr (stmt));
954 gsi_next (&si);
958 free (bbs);
960 release_vec_loop_masks (&masks);
961 delete ivexpr_map;
963 loop->aux = NULL;
966 /* Return an invariant or register for EXPR and emit necessary
967 computations in the LOOP_VINFO loop preheader. */
969 tree
970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
972 if (is_gimple_reg (expr)
973 || is_gimple_min_invariant (expr))
974 return expr;
976 if (! loop_vinfo->ivexpr_map)
977 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
978 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
979 if (! cached)
981 gimple_seq stmts = NULL;
982 cached = force_gimple_operand (unshare_expr (expr),
983 &stmts, true, NULL_TREE);
984 if (stmts)
986 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
987 gsi_insert_seq_on_edge_immediate (e, stmts);
990 return cached;
993 /* Return true if we can use CMP_TYPE as the comparison type to produce
994 all masks required to mask LOOP_VINFO. */
996 static bool
997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
999 rgroup_masks *rgm;
1000 unsigned int i;
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002 if (rgm->mask_type != NULL_TREE
1003 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1004 cmp_type, rgm->mask_type,
1005 OPTIMIZE_FOR_SPEED))
1006 return false;
1007 return true;
1010 /* Calculate the maximum number of scalars per iteration for every
1011 rgroup in LOOP_VINFO. */
1013 static unsigned int
1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1016 unsigned int res = 1;
1017 unsigned int i;
1018 rgroup_masks *rgm;
1019 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1020 res = MAX (res, rgm->max_nscalars_per_iter);
1021 return res;
1024 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1025 whether we can actually generate the masks required. Return true if so,
1026 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1028 static bool
1029 vect_verify_full_masking (loop_vec_info loop_vinfo)
1031 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1032 unsigned int min_ni_width;
1034 /* Use a normal loop if there are no statements that need masking.
1035 This only happens in rare degenerate cases: it means that the loop
1036 has no loads, no stores, and no live-out values. */
1037 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1038 return false;
1040 /* Get the maximum number of iterations that is representable
1041 in the counter type. */
1042 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1043 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1045 /* Get a more refined estimate for the number of iterations. */
1046 widest_int max_back_edges;
1047 if (max_loop_iterations (loop, &max_back_edges))
1048 max_ni = wi::smin (max_ni, max_back_edges + 1);
1050 /* Account for rgroup masks, in which each bit is replicated N times. */
1051 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1053 /* Work out how many bits we need to represent the limit. */
1054 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1056 /* Find a scalar mode for which WHILE_ULT is supported. */
1057 opt_scalar_int_mode cmp_mode_iter;
1058 tree cmp_type = NULL_TREE;
1059 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1061 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1062 if (cmp_bits >= min_ni_width
1063 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1065 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1066 if (this_type
1067 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1069 /* Although we could stop as soon as we find a valid mode,
1070 it's often better to continue until we hit Pmode, since the
1071 operands to the WHILE are more likely to be reusable in
1072 address calculations. */
1073 cmp_type = this_type;
1074 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1075 break;
1080 if (!cmp_type)
1081 return false;
1083 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1084 return true;
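/* A scalar sketch of the width computation above (illustrative only; the
   real code uses widest_int and wi::min_precision): the WHILE_ULT
   comparison must be able to represent iterations * nscalars, e.g.
   1000 iterations with 2 mask bits per scalar iteration gives 2000,
   which needs 11 bits.  */

static unsigned int
example_min_ni_width (unsigned HOST_WIDE_INT max_iters,
		      unsigned int max_nscalars_per_iter)
{
  unsigned HOST_WIDE_INT limit = max_iters * max_nscalars_per_iter;
  unsigned int bits = 0;
  while (limit != 0)
    {
      bits++;
      limit >>= 1;
    }
  return bits;
}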
1087 /* Calculate the cost of one scalar iteration of the loop. */
1088 static void
1089 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1092 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1093 int nbbs = loop->num_nodes, factor;
1094 int innerloop_iters, i;
1096 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1098 /* Gather costs for statements in the scalar loop. */
1100 /* FORNOW. */
1101 innerloop_iters = 1;
1102 if (loop->inner)
1103 innerloop_iters = 50; /* FIXME */
1105 for (i = 0; i < nbbs; i++)
1107 gimple_stmt_iterator si;
1108 basic_block bb = bbs[i];
1110 if (bb->loop_father == loop->inner)
1111 factor = innerloop_iters;
1112 else
1113 factor = 1;
1115 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1117 gimple *stmt = gsi_stmt (si);
1118 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1120 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1121 continue;
1123 /* Skip stmts that are not vectorized inside the loop. */
1124 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1125 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1126 && (!STMT_VINFO_LIVE_P (vstmt_info)
1127 || !VECTORIZABLE_CYCLE_DEF
1128 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1129 continue;
1131 vect_cost_for_stmt kind;
1132 if (STMT_VINFO_DATA_REF (stmt_info))
1134 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1135 kind = scalar_load;
1136 else
1137 kind = scalar_store;
1139 else
1140 kind = scalar_stmt;
1142 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1143 factor, kind, stmt_info, 0, vect_prologue);
1147 /* Now accumulate cost. */
1148 void *target_cost_data = init_cost (loop);
1149 stmt_info_for_cost *si;
1150 int j;
1151 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1152 j, si)
1153 (void) add_stmt_cost (target_cost_data, si->count,
1154 si->kind, si->stmt_info, si->misalign,
1155 vect_body);
1156 unsigned dummy, body_cost = 0;
1157 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1158 destroy_cost_data (target_cost_data);
1159 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1163 /* Function vect_analyze_loop_form_1.
1165 Verify that certain CFG restrictions hold, including:
1166 - the loop has a pre-header
1167 - the loop has a single entry and exit
1168 - the loop exit condition is simple enough
1169 - the number of iterations can be analyzed, i.e., a countable loop. The
1170 niter could be analyzed under some assumptions. */
1172 opt_result
1173 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1174 tree *assumptions, tree *number_of_iterationsm1,
1175 tree *number_of_iterations, gcond **inner_loop_cond)
1177 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1179 /* Different restrictions apply when we are considering an inner-most loop,
1180 vs. an outer (nested) loop.
1181 (FORNOW. May want to relax some of these restrictions in the future). */
1183 if (!loop->inner)
1185 /* Inner-most loop. We currently require that the number of BBs is
1186 exactly 2 (the header and latch). Vectorizable inner-most loops
1187 look like this:
1189 (pre-header)
1191 header <--------+
1192 | | |
1193 | +--> latch --+
1195 (exit-bb) */
1197 if (loop->num_nodes != 2)
1198 return opt_result::failure_at (vect_location,
1199 "not vectorized:"
1200 " control flow in loop.\n");
1202 if (empty_block_p (loop->header))
1203 return opt_result::failure_at (vect_location,
1204 "not vectorized: empty loop.\n");
1206 else
1208 struct loop *innerloop = loop->inner;
1209 edge entryedge;
1211 /* Nested loop. We currently require that the loop is doubly-nested,
1212 contains a single inner loop, and the number of BBs is exactly 5.
1213 Vectorizable outer-loops look like this:
1215 (pre-header)
1217 header <---+
1219 inner-loop |
1221 tail ------+
1223 (exit-bb)
1225 The inner-loop has the properties expected of inner-most loops
1226 as described above. */
1228 if ((loop->inner)->inner || (loop->inner)->next)
1229 return opt_result::failure_at (vect_location,
1230 "not vectorized:"
1231 " multiple nested loops.\n");
1233 if (loop->num_nodes != 5)
1234 return opt_result::failure_at (vect_location,
1235 "not vectorized:"
1236 " control flow in loop.\n");
1238 entryedge = loop_preheader_edge (innerloop);
1239 if (entryedge->src != loop->header
1240 || !single_exit (innerloop)
1241 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized:"
1244 " unsupported outerloop form.\n");
1246 /* Analyze the inner-loop. */
1247 tree inner_niterm1, inner_niter, inner_assumptions;
1248 opt_result res
1249 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1250 &inner_assumptions, &inner_niterm1,
1251 &inner_niter, NULL);
1252 if (!res)
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "not vectorized: Bad inner loop.\n");
1257 return res;
1260 /* Don't support analyzing niter under assumptions for inner
1261 loop. */
1262 if (!integer_onep (inner_assumptions))
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized: Bad inner loop.\n");
1266 if (!expr_invariant_in_loop_p (loop, inner_niter))
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized: inner-loop count not"
1269 " invariant.\n");
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_NOTE, vect_location,
1273 "Considering outer-loop vectorization.\n");
1276 if (!single_exit (loop))
1277 return opt_result::failure_at (vect_location,
1278 "not vectorized: multiple exits.\n");
1279 if (EDGE_COUNT (loop->header->preds) != 2)
1280 return opt_result::failure_at (vect_location,
1281 "not vectorized:"
1282 " too many incoming edges.\n");
1284 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1285 that the loop is represented as a do-while (with a proper if-guard
1286 before the loop if needed), where the loop header contains all the
1287 executable statements, and the latch is empty. */
1288 if (!empty_block_p (loop->latch)
1289 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1290 return opt_result::failure_at (vect_location,
1291 "not vectorized: latch block not empty.\n");
1293 /* Make sure the exit is not abnormal. */
1294 edge e = single_exit (loop);
1295 if (e->flags & EDGE_ABNORMAL)
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized:"
1298 " abnormal loop exit edge.\n");
1300 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1301 number_of_iterationsm1);
1302 if (!*loop_cond)
1303 return opt_result::failure_at
1304 (vect_location,
1305 "not vectorized: complicated exit condition.\n");
1307 if (integer_zerop (*assumptions)
1308 || !*number_of_iterations
1309 || chrec_contains_undetermined (*number_of_iterations))
1310 return opt_result::failure_at
1311 (*loop_cond,
1312 "not vectorized: number of iterations cannot be computed.\n");
1314 if (integer_zerop (*number_of_iterations))
1315 return opt_result::failure_at
1316 (*loop_cond,
1317 "not vectorized: number of iterations = 0.\n");
1319 return opt_result::success ();
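/* Source shape accepted by the checks above (an illustrative sketch, not
   used by the pass): a single-entry, single-exit inner loop whose gimple
   form keeps all work in the header, has an empty latch and a countable
   iteration count.  */

static void
example_vectorizable_form (int *a, int *b, int *c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] + c[i];
}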
1322 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1324 opt_loop_vec_info
1325 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1327 tree assumptions, number_of_iterations, number_of_iterationsm1;
1328 gcond *loop_cond, *inner_loop_cond = NULL;
1330 opt_result res
1331 = vect_analyze_loop_form_1 (loop, &loop_cond,
1332 &assumptions, &number_of_iterationsm1,
1333 &number_of_iterations, &inner_loop_cond);
1334 if (!res)
1335 return opt_loop_vec_info::propagate_failure (res);
1337 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1338 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1339 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1340 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1341 if (!integer_onep (assumptions))
1343 /* We consider to vectorize this loop by versioning it under
1344 some assumptions. In order to do this, we need to clear
1345 existing information computed by scev and niter analyzer. */
1346 scev_reset_htab ();
1347 free_numbers_of_iterations_estimates (loop);
1348 /* Also set flag for this loop so that following scev and niter
1349 analysis are done under the assumptions. */
1350 loop_constraint_set (loop, LOOP_C_FINITE);
1351 /* Also record the assumptions for versioning. */
1352 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1355 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1357 if (dump_enabled_p ())
1359 dump_printf_loc (MSG_NOTE, vect_location,
1360 "Symbolic number of iterations is ");
1361 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1362 dump_printf (MSG_NOTE, "\n");
1366 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1367 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1368 if (inner_loop_cond)
1370 stmt_vec_info inner_loop_cond_info
1371 = loop_vinfo->lookup_stmt (inner_loop_cond);
1372 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1375 gcc_assert (!loop->aux);
1376 loop->aux = loop_vinfo;
1377 return opt_loop_vec_info::success (loop_vinfo);
1382 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1383 statements update the vectorization factor. */
1385 static void
1386 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1388 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1389 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1390 int nbbs = loop->num_nodes;
1391 poly_uint64 vectorization_factor;
1392 int i;
1394 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1396 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397 gcc_assert (known_ne (vectorization_factor, 0U));
1399 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1400 vectorization factor of the loop is the unrolling factor required by
1401 the SLP instances. If that unrolling factor is 1, we say, that we
1402 perform pure SLP on loop - cross iteration parallelism is not
1403 exploited. */
1404 bool only_slp_in_loop = true;
1405 for (i = 0; i < nbbs; i++)
1407 basic_block bb = bbs[i];
1408 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1409 gsi_next (&si))
1411 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1412 stmt_info = vect_stmt_to_vectorize (stmt_info);
1413 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1414 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1415 && !PURE_SLP_STMT (stmt_info))
1416 /* STMT needs both SLP and loop-based vectorization. */
1417 only_slp_in_loop = false;
1421 if (only_slp_in_loop)
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_NOTE, vect_location,
1425 "Loop contains only SLP stmts\n");
1426 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1428 else
1430 if (dump_enabled_p ())
1431 dump_printf_loc (MSG_NOTE, vect_location,
1432 "Loop contains SLP and non-SLP stmts\n");
1433 /* Both the vectorization factor and unroll factor have the form
1434 current_vector_size * X for some rational X, so they must have
1435 a common multiple. */
1436 vectorization_factor
1437 = force_common_multiple (vectorization_factor,
1438 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1441 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1442 if (dump_enabled_p ())
1444 dump_printf_loc (MSG_NOTE, vect_location,
1445 "Updating vectorization factor to ");
1446 dump_dec (MSG_NOTE, vectorization_factor);
1447 dump_printf (MSG_NOTE, ".\n");
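/* A constant-value sketch of what force_common_multiple computes in the
   update above (the real code works on poly_uint64): e.g. a loop VF of 4
   combined with an SLP unrolling factor of 6 yields 12.  Illustrative
   only; not used by the pass.  */

static unsigned int
example_common_multiple (unsigned int vf, unsigned int slp_unroll)
{
  unsigned int a = vf, b = slp_unroll;
  while (b != 0)
    {
      unsigned int t = a % b;	/* Euclid's algorithm for the gcd.  */
      a = b;
      b = t;
    }
  return (vf / a) * slp_unroll;	/* lcm = vf * slp_unroll / gcd.  */
}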
1451 /* Return true if STMT_INFO describes a double reduction phi and if
1452 the other phi in the reduction is also relevant for vectorization.
1453 This rejects cases such as:
1455 outer1:
1456 x_1 = PHI <x_3(outer2), ...>;
1459 inner:
1460 x_2 = ...;
1463 outer2:
1464 x_3 = PHI <x_2(inner)>;
1466 if nothing in x_2 or elsewhere makes x_1 relevant. */
1468 static bool
1469 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1471 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1472 return false;
1474 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1477 /* Function vect_analyze_loop_operations.
1479 Scan the loop stmts and make sure they are all vectorizable. */
1481 static opt_result
1482 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1484 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1485 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1486 int nbbs = loop->num_nodes;
1487 int i;
1488 stmt_vec_info stmt_info;
1489 bool need_to_vectorize = false;
1490 bool ok;
1492 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1494 auto_vec<stmt_info_for_cost> cost_vec;
1496 for (i = 0; i < nbbs; i++)
1498 basic_block bb = bbs[i];
1500 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1501 gsi_next (&si))
1503 gphi *phi = si.phi ();
1504 ok = true;
1506 stmt_info = loop_vinfo->lookup_stmt (phi);
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1509 if (virtual_operand_p (gimple_phi_result (phi)))
1510 continue;
1512 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1513 (i.e., a phi in the tail of the outer-loop). */
1514 if (! is_loop_header_bb_p (bb))
1516 /* FORNOW: we currently don't support the case that these phis
1517 are not used in the outerloop (unless it is double reduction,
1518 i.e., this phi is vect_reduction_def), because this case
1519 would require us to actually do something here. */
1520 if (STMT_VINFO_LIVE_P (stmt_info)
1521 && !vect_active_double_reduction_p (stmt_info))
1522 return opt_result::failure_at (phi,
1523 "Unsupported loop-closed phi"
1524 " in outer-loop.\n");
1526 /* If PHI is used in the outer loop, we check that its operand
1527 is defined in the inner loop. */
1528 if (STMT_VINFO_RELEVANT_P (stmt_info))
1530 tree phi_op;
1532 if (gimple_phi_num_args (phi) != 1)
1533 return opt_result::failure_at (phi, "unsupported phi");
1535 phi_op = PHI_ARG_DEF (phi, 0);
1536 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1537 if (!op_def_info)
1538 return opt_result::failure_at (phi, "unsupported phi");
1540 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1541 && (STMT_VINFO_RELEVANT (op_def_info)
1542 != vect_used_in_outer_by_reduction))
1543 return opt_result::failure_at (phi, "unsupported phi");
1546 continue;
1549 gcc_assert (stmt_info);
1551 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1552 || STMT_VINFO_LIVE_P (stmt_info))
1553 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1554 /* A scalar-dependence cycle that we don't support. */
1555 return opt_result::failure_at (phi,
1556 "not vectorized:"
1557 " scalar dependence cycle.\n");
1559 if (STMT_VINFO_RELEVANT_P (stmt_info))
1561 need_to_vectorize = true;
1562 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1563 && ! PURE_SLP_STMT (stmt_info))
1564 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1565 &cost_vec);
1566 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1567 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1568 && ! PURE_SLP_STMT (stmt_info))
1569 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1570 &cost_vec);
1573 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1574 if (ok
1575 && STMT_VINFO_LIVE_P (stmt_info)
1576 && !PURE_SLP_STMT (stmt_info))
1577 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1578 &cost_vec);
1580 if (!ok)
1581 return opt_result::failure_at (phi,
1582 "not vectorized: relevant phi not "
1583 "supported: %G",
1584 static_cast <gimple *> (phi));
1587 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588 gsi_next (&si))
1590 gimple *stmt = gsi_stmt (si);
1591 if (!gimple_clobber_p (stmt))
1593 opt_result res
1594 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1595 &need_to_vectorize,
1596 NULL, NULL, &cost_vec);
1597 if (!res)
1598 return res;
1601 } /* bbs */
1603 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1605 /* All operations in the loop are either irrelevant (deal with loop
1606 control, or dead), or only used outside the loop and can be moved
1607 out of the loop (e.g. invariants, inductions). The loop can be
1608 optimized away by scalar optimizations. We're better off not
1609 touching this loop. */
1610 if (!need_to_vectorize)
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_NOTE, vect_location,
1614 "All the computation can be taken out of the loop.\n");
1615 return opt_result::failure_at
1616 (vect_location,
1617 "not vectorized: redundant loop. no profit to vectorize.\n");
1620 return opt_result::success ();
1623 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1624 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1625 definitely no, or -1 if it's worth retrying. */
1627 static int
1628 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1633 /* Only fully-masked loops can have iteration counts less than the
1634 vectorization factor. */
1635 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1637 HOST_WIDE_INT max_niter;
1639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1640 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1641 else
1642 max_niter = max_stmt_executions_int (loop);
1644 if (max_niter != -1
1645 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 "not vectorized: iteration count smaller than "
1650 "vectorization factor.\n");
1651 return 0;
1655 int min_profitable_iters, min_profitable_estimate;
1656 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1657 &min_profitable_estimate);
1659 if (min_profitable_iters < 0)
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vectorization not profitable.\n");
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "not vectorized: vector version will never be "
1667 "profitable.\n");
1668 return -1;
1671 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1672 * assumed_vf);
1674 /* Use the cost model only if it is more conservative than user specified
1675 threshold. */
1676 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1677 min_profitable_iters);
1679 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1681 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1682 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "not vectorized: vectorization not profitable.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "not vectorized: iteration count smaller than user "
1690 "specified loop bound parameter or minimum profitable "
1691 "iterations (whichever is more conservative).\n");
1692 return 0;
1695 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1696 if (estimated_niter == -1)
1697 estimated_niter = likely_max_stmt_executions_int (loop);
1698 if (estimated_niter != -1
1699 && ((unsigned HOST_WIDE_INT) estimated_niter
1700 < MAX (th, (unsigned) min_profitable_estimate)))
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "not vectorized: estimated iteration count too "
1705 "small.\n");
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "not vectorized: estimated iteration count smaller "
1709 "than specified loop bound parameter or minimum "
1710 "profitable iterations (whichever is more "
1711 "conservative).\n");
1712 return -1;
1715 return 1;
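/* A sketch of the threshold logic above with made-up numbers: with
   --param min-vect-loop-bound=2, an assumed VF of 4 and a cost-model
   answer of min_profitable_iters=5, the threshold is MAX (2 * 4, 5) = 8,
   so a known iteration count below 8 is rejected as unprofitable.
   Illustrative only; not used by the pass.  */

static unsigned int
example_profitability_threshold (unsigned int min_vect_loop_bound,
				 unsigned int assumed_vf,
				 unsigned int min_profitable_iters)
{
  return MAX (min_vect_loop_bound * assumed_vf, min_profitable_iters);
}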
1718 static opt_result
1719 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1720 vec<data_reference_p> *datarefs,
1721 unsigned int *n_stmts)
1723 *n_stmts = 0;
1724 for (unsigned i = 0; i < loop->num_nodes; i++)
1725 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1726 !gsi_end_p (gsi); gsi_next (&gsi))
1728 gimple *stmt = gsi_stmt (gsi);
1729 if (is_gimple_debug (stmt))
1730 continue;
1731 ++(*n_stmts);
1732 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1733 if (!res)
1735 if (is_gimple_call (stmt) && loop->safelen)
1737 tree fndecl = gimple_call_fndecl (stmt), op;
1738 if (fndecl != NULL_TREE)
1740 cgraph_node *node = cgraph_node::get (fndecl);
1741 if (node != NULL && node->simd_clones != NULL)
1743 unsigned int j, n = gimple_call_num_args (stmt);
1744 for (j = 0; j < n; j++)
1746 op = gimple_call_arg (stmt, j);
1747 if (DECL_P (op)
1748 || (REFERENCE_CLASS_P (op)
1749 && get_base_address (op)))
1750 break;
1752 op = gimple_call_lhs (stmt);
1753 /* Ignore #pragma omp declare simd functions
1754 if they don't have data references in the
1755 call stmt itself. */
1756 if (j == n
1757 && !(op
1758 && (DECL_P (op)
1759 || (REFERENCE_CLASS_P (op)
1760 && get_base_address (op)))))
1761 continue;
1765 return res;
1767 /* If dependence analysis will give up due to the limit on the
1768 number of datarefs stop here and fail fatally. */
1769 if (datarefs->length ()
1770 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1771 return opt_result::failure_at (stmt, "exceeded param "
1772 "loop-max-datarefs-for-datadeps\n");
1774 return opt_result::success ();
1777 /* Function vect_analyze_loop_2.
1779 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780 for it. The different analyses will record information in the
1781 loop_vec_info struct. */
1782 static opt_result
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1785 opt_result ok = opt_result::success ();
1786 int res;
1787 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1788 poly_uint64 min_vf = 2;
1790 /* The first group of checks is independent of the vector size. */
1791 fatal = true;
1793 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1794 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1795 return opt_result::failure_at (vect_location,
1796 "not vectorized: simd if(0)\n");
1798 /* Find all data references in the loop (which correspond to vdefs/vuses)
1799 and analyze their evolution in the loop. */
1801 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1803 /* Gather the data references and count stmts in the loop. */
1804 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1806 opt_result res
1807 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1808 &LOOP_VINFO_DATAREFS (loop_vinfo),
1809 n_stmts);
1810 if (!res)
1812 if (dump_enabled_p ())
1813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1814 "not vectorized: loop contains function "
1815 "calls or data references that cannot "
1816 "be analyzed\n");
1817 return res;
1819 loop_vinfo->shared->save_datarefs ();
1821 else
1822 loop_vinfo->shared->check_datarefs ();
1824 /* Analyze the data references and also adjust the minimal
1825 vectorization factor according to the loads and stores. */
1827 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1828 if (!ok)
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832 "bad data references.\n");
1833 return ok;
1836 /* Classify all cross-iteration scalar data-flow cycles.
1837 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1838 vect_analyze_scalar_cycles (loop_vinfo);
1840 vect_pattern_recog (loop_vinfo);
1842 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1844 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1845 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1847 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1848 if (!ok)
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 "bad data access.\n");
1853 return ok;
1856 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1858 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1859 if (!ok)
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "unexpected pattern.\n");
1864 return ok;
1867 /* The rest of the analysis below depends on the vector size in some way, so failures from this point on are not fatal: we can retry with a different vector size. */
1868 fatal = false;
1870 /* Analyze data dependences between the data-refs in the loop
1871 and adjust the maximum vectorization factor according to
1872 the dependences.
1873 FORNOW: fail at the first data dependence that we encounter. */
1875 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1876 if (!ok)
1878 if (dump_enabled_p ())
1879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1880 "bad data dependence.\n");
1881 return ok;
1883 if (max_vf != MAX_VECTORIZATION_FACTOR
1884 && maybe_lt (max_vf, min_vf))
1885 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1886 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1888 ok = vect_determine_vectorization_factor (loop_vinfo);
1889 if (!ok)
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 "can't determine vectorization factor.\n");
1894 return ok;
1896 if (max_vf != MAX_VECTORIZATION_FACTOR
1897 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1898 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1900 /* Compute the scalar iteration cost. */
1901 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1903 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1904 unsigned th;
1906 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1907 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1908 if (!ok)
1909 return ok;
1911 /* If there are any SLP instances mark them as pure_slp. */
1912 bool slp = vect_make_slp_decision (loop_vinfo);
1913 if (slp)
1915 /* Find stmts that need to be both vectorized and SLPed. */
1916 vect_detect_hybrid_slp (loop_vinfo);
1918 /* Update the vectorization factor based on the SLP decision. */
1919 vect_update_vf_for_slp (loop_vinfo);
1922 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1924 /* We don't expect to have to roll back to anything other than an empty
1925 set of rgroups. */
1926 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1928 /* This is the point where we can re-start analysis with SLP forced off. */
1929 start_over:
1931 /* Now the vectorization factor is final. */
1932 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1933 gcc_assert (known_ne (vectorization_factor, 0U));
1935 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1937 dump_printf_loc (MSG_NOTE, vect_location,
1938 "vectorization_factor = ");
1939 dump_dec (MSG_NOTE, vectorization_factor);
1940 dump_printf (MSG_NOTE, ", niters = %wd\n",
1941 LOOP_VINFO_INT_NITERS (loop_vinfo));
1944 HOST_WIDE_INT max_niter
1945 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1947 /* Analyze the alignment of the data-refs in the loop.
1948 Fail if a data reference is found that cannot be vectorized. */
1950 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1951 if (!ok)
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1955 "bad data alignment.\n");
1956 return ok;
1959 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1960 It is important to call pruning after vect_analyze_data_ref_accesses,
1961 since we use grouping information gathered by interleaving analysis. */
1962 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1963 if (!ok)
1964 return ok;
1966 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1967 vectorization, since we do not want to add extra peeling or
1968 add versioning for alignment. */
1969 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1970 /* This pass will decide on using loop versioning and/or loop peeling in
1971 order to enhance the alignment of data references in the loop. */
1972 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1973 else
1974 ok = vect_verify_datarefs_alignment (loop_vinfo);
1975 if (!ok)
1976 return ok;
1978 if (slp)
1980 /* Analyze operations in the SLP instances. Note this may
1981 remove unsupported SLP instances which makes the above
1982 SLP kind detection invalid. */
1983 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1984 vect_slp_analyze_operations (loop_vinfo);
1985 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1987 ok = opt_result::failure_at (vect_location,
1988 "unsupported SLP instances\n");
1989 goto again;
1993 /* Scan all the remaining operations in the loop that are not subject
1994 to SLP and make sure they are vectorizable. */
1995 ok = vect_analyze_loop_operations (loop_vinfo);
1996 if (!ok)
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2000 "bad operation or unsupported loop bound.\n");
2001 return ok;
2004 /* Decide whether to use a fully-masked loop for this vectorization
2005 factor. */
2006 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2007 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2008 && vect_verify_full_masking (loop_vinfo));
2009 if (dump_enabled_p ())
2011 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "using a fully-masked loop.\n");
2014 else
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "not using a fully-masked loop.\n");
2019 /* If epilog loop is required because of data accesses with gaps,
2020 one additional iteration needs to be peeled. Check if there are
2021 enough iterations for vectorization. */
2022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2023 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2024 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2026 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2027 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2029 if (known_lt (wi::to_widest (scalar_niters), vf))
2030 return opt_result::failure_at (vect_location,
2031 "loop has no enough iterations to"
2032 " support peeling for gaps.\n");
2035 /* Check the costings of the loop make vectorizing worthwhile. */
2036 res = vect_analyze_loop_costing (loop_vinfo);
2037 if (res < 0)
2039 ok = opt_result::failure_at (vect_location,
2040 "Loop costings may not be worthwhile.\n");
2041 goto again;
2043 if (!res)
2044 return opt_result::failure_at (vect_location,
2045 "Loop costings not worthwhile.\n");
2047 /* Decide whether we need to create an epilogue loop to handle
2048 remaining scalar iterations. */
2049 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2051 unsigned HOST_WIDE_INT const_vf;
2052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2053 /* The main loop handles all iterations. */
2054 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2055 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2056 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2058 /* Work out the (constant) number of iterations that need to be
2059 peeled for reasons other than niters. */
2060 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2061 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2062 peel_niter += 1;
2063 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2064 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2065 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2067 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2068 /* ??? When peeling for gaps but not alignment, we could
2069 try to check whether the (variable) niters is known to be
2070 VF * N + 1. That's something of a niche case though. */
2071 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2072 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2073 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2074 < (unsigned) exact_log2 (const_vf))
2075 /* In case of versioning, check if the maximum number of
2076 iterations is greater than th. If they are identical,
2077 the epilogue is unnecessary. */
2078 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2079 || ((unsigned HOST_WIDE_INT) max_niter
2080 > (th / const_vf) * const_vf))))
2081 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
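  /* Illustrative example: with a constant VF of 4, a known iteration count
     of 103 and no peeling for alignment or gaps, peel_niter is 0 and 103 is
     not a multiple of 4, so an epilogue loop is needed for the 3 leftover
     scalar iterations; with 104 iterations no epilogue would be needed.  */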
2083 /* If an epilogue loop is required make sure we can create one. */
2084 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2085 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2089 if (!vect_can_advance_ivs_p (loop_vinfo)
2090 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2091 single_exit (LOOP_VINFO_LOOP
2092 (loop_vinfo))))
2094 ok = opt_result::failure_at (vect_location,
2095 "not vectorized: can't create required "
2096 "epilog loop\n");
2097 goto again;
2101 /* During peeling, we need to check whether the number of loop iterations
2102 is enough for both the peeled prolog loop and the vector loop. This
2103 check can be merged with the threshold check of loop versioning, so
2104 increase the threshold for this case if necessary. */
2105 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2107 poly_uint64 niters_th = 0;
2109 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2111 /* Niters for peeled prolog loop. */
2112 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2114 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2115 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2116 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2118 else
2119 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2122 /* Niters for at least one iteration of vectorized loop. */
2123 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2124 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2125 /* One additional iteration because of peeling for gap. */
2126 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2127 niters_th += 1;
2128 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
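      /* Illustrative example (assumed numbers): with a V4SI data reference
	 whose misalignment is unknown, prologue peeling contributes
	 4 - 1 = 3 iterations; a not-fully-masked loop with VF = 4 adds 4
	 more; peeling for gaps adds 1; so the versioning threshold becomes
	 3 + 4 + 1 = 8 iterations.  */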
2131 gcc_assert (known_eq (vectorization_factor,
2132 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2134 /* Ok to vectorize! */
2135 return opt_result::success ();
2137 again:
2138 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2139 gcc_assert (!ok);
2141 /* Try again with SLP forced off but if we didn't do any SLP there is
2142 no point in re-trying. */
2143 if (!slp)
2144 return ok;
2146 /* If there are reduction chains re-trying will fail anyway. */
2147 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2148 return ok;
2150 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2151 via interleaving or lane instructions. */
2152 slp_instance instance;
2153 slp_tree node;
2154 unsigned i, j;
2155 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2157 stmt_vec_info vinfo;
2158 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2159 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2160 continue;
2161 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2162 unsigned int size = DR_GROUP_SIZE (vinfo);
2163 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2164 if (! vect_store_lanes_supported (vectype, size, false)
2165 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2166 && ! vect_grouped_store_supported (vectype, size))
2167 return opt_result::failure_at (vinfo->stmt,
2168 "unsupported grouped store\n");
2169 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2171 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2172 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2173 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2174 size = DR_GROUP_SIZE (vinfo);
2175 vectype = STMT_VINFO_VECTYPE (vinfo);
2176 if (! vect_load_lanes_supported (vectype, size, false)
2177 && ! vect_grouped_load_supported (vectype, single_element_p,
2178 size))
2179 return opt_result::failure_at (vinfo->stmt,
2180 "unsupported grouped load\n");
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_NOTE, vect_location,
2186 "re-trying with SLP disabled\n");
2188 /* Roll back state appropriately. No SLP this time. */
2189 slp = false;
2191 /* Restore the vectorization factor to what it was without SLP. */
2191 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2192 /* Free the SLP instances. */
2193 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2194 vect_free_slp_instance (instance, false);
2195 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2196 /* Reset SLP type to loop_vect on all stmts. */
2197 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2199 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2200 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2201 !gsi_end_p (si); gsi_next (&si))
2203 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2204 STMT_SLP_TYPE (stmt_info) = loop_vect;
2206 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2207 !gsi_end_p (si); gsi_next (&si))
2209 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2210 STMT_SLP_TYPE (stmt_info) = loop_vect;
2211 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2213 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2214 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2215 STMT_SLP_TYPE (stmt_info) = loop_vect;
2216 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2217 !gsi_end_p (pi); gsi_next (&pi))
2218 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2219 = loop_vect;
2223 /* Free optimized alias test DDRS. */
2224 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2225 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2226 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2227 /* Reset target cost data. */
2228 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2229 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2230 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2231 /* Reset accumulated rgroup information. */
2232 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2233 /* Reset assorted flags. */
2234 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2235 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2236 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2237 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2238 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2240 goto start_over;
2243 /* Function vect_analyze_loop.
2245 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2246 for it. The different analyses will record information in the
2247 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2248 be vectorized. */
2249 opt_loop_vec_info
2250 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2251 vec_info_shared *shared)
2253 auto_vector_sizes vector_sizes;
2255 /* Autodetect first vector size we try. */
2256 current_vector_size = 0;
2257 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2258 loop->simdlen != 0);
2259 unsigned int next_size = 0;
2261 DUMP_VECT_SCOPE ("analyze_loop_nest");
2263 if (loop_outer (loop)
2264 && loop_vec_info_for_loop (loop_outer (loop))
2265 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2266 return opt_loop_vec_info::failure_at (vect_location,
2267 "outer-loop already vectorized.\n");
2269 if (!find_loop_nest (loop, &shared->loop_nest))
2270 return opt_loop_vec_info::failure_at
2271 (vect_location,
2272 "not vectorized: loop nest containing two or more consecutive inner"
2273 " loops cannot be vectorized\n");
2275 unsigned n_stmts = 0;
2276 poly_uint64 autodetected_vector_size = 0;
2277 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2278 poly_uint64 first_vector_size = 0;
2279 while (1)
2281 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2282 opt_loop_vec_info loop_vinfo
2283 = vect_analyze_loop_form (loop, shared);
2284 if (!loop_vinfo)
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 "bad loop form.\n");
2289 gcc_checking_assert (first_loop_vinfo == NULL);
2290 return loop_vinfo;
2293 bool fatal = false;
2295 if (orig_loop_vinfo)
2296 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2298 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2299 if (res)
2301 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2303 if (loop->simdlen
2304 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2305 (unsigned HOST_WIDE_INT) loop->simdlen))
2307 if (first_loop_vinfo == NULL)
2309 first_loop_vinfo = loop_vinfo;
2310 first_vector_size = current_vector_size;
2311 loop->aux = NULL;
2313 else
2314 delete loop_vinfo;
2316 else
2318 delete first_loop_vinfo;
2319 return loop_vinfo;
2322 else
2323 delete loop_vinfo;
2325 if (next_size == 0)
2326 autodetected_vector_size = current_vector_size;
2328 if (next_size < vector_sizes.length ()
2329 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2330 next_size += 1;
2332 if (fatal)
2334 gcc_checking_assert (first_loop_vinfo == NULL);
2335 return opt_loop_vec_info::propagate_failure (res);
2338 if (next_size == vector_sizes.length ()
2339 || known_eq (current_vector_size, 0U))
2341 if (first_loop_vinfo)
2343 current_vector_size = first_vector_size;
2344 loop->aux = (loop_vec_info) first_loop_vinfo;
2345 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_NOTE, vect_location,
2348 "***** Choosing vector size ");
2349 dump_dec (MSG_NOTE, current_vector_size);
2350 dump_printf (MSG_NOTE, "\n");
2352 return first_loop_vinfo;
2354 else
2355 return opt_loop_vec_info::propagate_failure (res);
2358 /* Try the next biggest vector size. */
2359 current_vector_size = vector_sizes[next_size++];
2360 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_NOTE, vect_location,
2363 "***** Re-trying analysis with "
2364 "vector size ");
2365 dump_dec (MSG_NOTE, current_vector_size);
2366 dump_printf (MSG_NOTE, "\n");
2371 /* Return true if there is an in-order reduction function for CODE, storing
2372 it in *REDUC_FN if so. */
2374 static bool
2375 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2377 switch (code)
2379 case PLUS_EXPR:
2380 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2381 return true;
2383 default:
2384 return false;
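/* Illustrative example: for a float accumulation

     for (i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math, the additions must stay in their
   original order, ((s + a[0]) + a[1]) + ..., which is what an in-order
   (fold-left) reduction expresses for PLUS_EXPR.  */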
2388 /* Function reduction_fn_for_scalar_code
2390 Input:
2391 CODE - tree_code of a reduction operation.
2393 Output:
2394 REDUC_FN - the corresponding internal function to be used to reduce the
2395 vector of partial results into a single scalar result, or IFN_LAST
2396 if the operation is a supported reduction operation, but does not have
2397 such an internal function.
2399 Return FALSE if CODE currently cannot be vectorized as reduction. */
2401 static bool
2402 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2404 switch (code)
2406 case MAX_EXPR:
2407 *reduc_fn = IFN_REDUC_MAX;
2408 return true;
2410 case MIN_EXPR:
2411 *reduc_fn = IFN_REDUC_MIN;
2412 return true;
2414 case PLUS_EXPR:
2415 *reduc_fn = IFN_REDUC_PLUS;
2416 return true;
2418 case BIT_AND_EXPR:
2419 *reduc_fn = IFN_REDUC_AND;
2420 return true;
2422 case BIT_IOR_EXPR:
2423 *reduc_fn = IFN_REDUC_IOR;
2424 return true;
2426 case BIT_XOR_EXPR:
2427 *reduc_fn = IFN_REDUC_XOR;
2428 return true;
2430 case MULT_EXPR:
2431 case MINUS_EXPR:
2432 *reduc_fn = IFN_LAST;
2433 return true;
2435 default:
2436 return false;
2440 /* If there is a neutral value X such that SLP reduction NODE would not
2441 be affected by the introduction of additional X elements, return that X,
2442 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2443 is true if the SLP statements perform a single reduction, false if each
2444 statement performs an independent reduction. */
2446 static tree
2447 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2448 bool reduc_chain)
2450 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2451 stmt_vec_info stmt_vinfo = stmts[0];
2452 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2453 tree scalar_type = TREE_TYPE (vector_type);
2454 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2455 gcc_assert (loop);
2457 switch (code)
2459 case WIDEN_SUM_EXPR:
2460 case DOT_PROD_EXPR:
2461 case SAD_EXPR:
2462 case PLUS_EXPR:
2463 case MINUS_EXPR:
2464 case BIT_IOR_EXPR:
2465 case BIT_XOR_EXPR:
2466 return build_zero_cst (scalar_type);
2468 case MULT_EXPR:
2469 return build_one_cst (scalar_type);
2471 case BIT_AND_EXPR:
2472 return build_all_ones_cst (scalar_type);
2474 case MAX_EXPR:
2475 case MIN_EXPR:
2476 /* For MIN/MAX the initial values are neutral. A reduction chain
2477 has only a single initial value, so that value is neutral for
2478 all statements. */
2479 if (reduc_chain)
2480 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2481 loop_preheader_edge (loop));
2482 return NULL_TREE;
2484 default:
2485 return NULL_TREE;
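/* Illustrative example: padding an SLP group of sums with extra 0.0
   elements, a product with extra 1.0 elements, or a bitwise AND with
   all-ones elements leaves every lane's result unchanged, which is what
   makes these values "neutral".  For MIN and MAX no such universal value
   exists, so only a reduction chain, which has a single shared initial
   value, can be padded, using that initial value.  */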
2489 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2490 STMT is printed with a message MSG. */
2492 static void
2493 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2495 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2498 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2499 operation. Return true if the results of DEF_STMT_INFO are something
2500 that can be accumulated by such a reduction. */
2502 static bool
2503 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2505 return (is_gimple_assign (def_stmt_info->stmt)
2506 || is_gimple_call (def_stmt_info->stmt)
2507 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2508 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2509 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2510 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2513 /* Detect SLP reduction of the form:
2515 #a1 = phi <a5, a0>
2516 a2 = operation (a1)
2517 a3 = operation (a2)
2518 a4 = operation (a3)
2519 a5 = operation (a4)
2521 #a = phi <a5>
2523 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2524 FIRST_STMT is the first reduction stmt in the chain
2525 (a2 = operation (a1)).
2527 Return TRUE if a reduction chain was detected. */
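/* For illustration, a source loop of the form

     s = ...;
     for (i = 0; i < n; i++)
       {
	 s += a[4 * i];
	 s += a[4 * i + 1];
	 s += a[4 * i + 2];
	 s += a[4 * i + 3];
       }

   gives rise to such a chain: each statement applies the same operation
   and its result is used only by the next statement, with the last result
   feeding back into the reduction phi.  */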
2529 static bool
2530 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2531 gimple *first_stmt)
2533 struct loop *loop = (gimple_bb (phi))->loop_father;
2534 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2535 enum tree_code code;
2536 gimple *loop_use_stmt = NULL;
2537 stmt_vec_info use_stmt_info;
2538 tree lhs;
2539 imm_use_iterator imm_iter;
2540 use_operand_p use_p;
2541 int nloop_uses, size = 0, n_out_of_loop_uses;
2542 bool found = false;
2544 if (loop != vect_loop)
2545 return false;
2547 auto_vec<stmt_vec_info, 8> reduc_chain;
2548 lhs = PHI_RESULT (phi);
2549 code = gimple_assign_rhs_code (first_stmt);
2550 while (1)
2552 nloop_uses = 0;
2553 n_out_of_loop_uses = 0;
2554 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2556 gimple *use_stmt = USE_STMT (use_p);
2557 if (is_gimple_debug (use_stmt))
2558 continue;
2560 /* Check if we got back to the reduction phi. */
2561 if (use_stmt == phi)
2563 loop_use_stmt = use_stmt;
2564 found = true;
2565 break;
2568 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2570 loop_use_stmt = use_stmt;
2571 nloop_uses++;
2573 else
2574 n_out_of_loop_uses++;
2576 /* There can be either a single use in the loop or two uses in
2577 phi nodes. */
2578 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2579 return false;
2582 if (found)
2583 break;
2585 /* We reached a statement with no loop uses. */
2586 if (nloop_uses == 0)
2587 return false;
2589 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2590 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2591 return false;
2593 if (!is_gimple_assign (loop_use_stmt)
2594 || code != gimple_assign_rhs_code (loop_use_stmt)
2595 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2596 return false;
2598 /* Insert USE_STMT into reduction chain. */
2599 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2600 reduc_chain.safe_push (use_stmt_info);
2602 lhs = gimple_assign_lhs (loop_use_stmt);
2603 size++;
2606 if (!found || loop_use_stmt != phi || size < 2)
2607 return false;
2609 /* Swap the operands, if needed, to make the reduction operand be the second
2610 operand. */
2611 lhs = PHI_RESULT (phi);
2612 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2614 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2615 if (gimple_assign_rhs2 (next_stmt) == lhs)
2617 tree op = gimple_assign_rhs1 (next_stmt);
2618 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2620 /* Check that the other def is either defined in the loop
2621 ("vect_internal_def"), or it's an induction (defined by a
2622 loop-header phi-node). */
2623 if (def_stmt_info
2624 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2625 && vect_valid_reduction_input_p (def_stmt_info))
2627 lhs = gimple_assign_lhs (next_stmt);
2628 continue;
2631 return false;
2633 else
2635 tree op = gimple_assign_rhs2 (next_stmt);
2636 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2638 /* Check that the other def is either defined in the loop
2639 ("vect_internal_def"), or it's an induction (defined by a
2640 loop-header phi-node). */
2641 if (def_stmt_info
2642 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2643 && vect_valid_reduction_input_p (def_stmt_info))
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2647 next_stmt);
2649 swap_ssa_operands (next_stmt,
2650 gimple_assign_rhs1_ptr (next_stmt),
2651 gimple_assign_rhs2_ptr (next_stmt));
2652 update_stmt (next_stmt);
2654 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2655 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2657 else
2658 return false;
2661 lhs = gimple_assign_lhs (next_stmt);
2664 /* Build up the actual chain. */
2665 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2667 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2668 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2670 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2671 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2673 /* Save the chain for further analysis in SLP detection. */
2674 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2675 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2677 return true;
2680 /* Return true if we need an in-order reduction for operation CODE
2681 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2682 overflow must wrap. */
2684 static bool
2685 needs_fold_left_reduction_p (tree type, tree_code code,
2686 bool need_wrapping_integral_overflow)
2688 /* CHECKME: check for !flag_finite_math_only too? */
2689 if (SCALAR_FLOAT_TYPE_P (type))
2690 switch (code)
2692 case MIN_EXPR:
2693 case MAX_EXPR:
2694 return false;
2696 default:
2697 return !flag_associative_math;
2700 if (INTEGRAL_TYPE_P (type))
2702 if (!operation_no_trapping_overflow (type, code))
2703 return true;
2704 if (need_wrapping_integral_overflow
2705 && !TYPE_OVERFLOW_WRAPS (type)
2706 && operation_can_overflow (code))
2707 return true;
2708 return false;
2711 if (SAT_FIXED_POINT_TYPE_P (type))
2712 return true;
2714 return false;
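/* Illustrative examples: a float "+" reduction needs an in-order
   (fold-left) reduction unless -fassociative-math is in effect, while a
   float MIN/MAX reduction never does; a signed integer "+" reduction needs
   one when overflow traps (e.g. with -ftrapv); an unsigned integer "+"
   reduction, whose overflow wraps, never does.  */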
2717 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2718 reduction operation CODE has a handled computation expression. */
2720 bool
2721 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2722 tree loop_arg, enum tree_code code)
2724 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2725 auto_bitmap visited;
2726 tree lookfor = PHI_RESULT (phi);
2727 ssa_op_iter curri;
2728 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2729 while (USE_FROM_PTR (curr) != loop_arg)
2730 curr = op_iter_next_use (&curri);
2731 curri.i = curri.numops;
2734 path.safe_push (std::make_pair (curri, curr));
2735 tree use = USE_FROM_PTR (curr);
2736 if (use == lookfor)
2737 break;
2738 gimple *def = SSA_NAME_DEF_STMT (use);
2739 if (gimple_nop_p (def)
2740 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2742 pop:
2745 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2746 curri = x.first;
2747 curr = x.second;
2749 curr = op_iter_next_use (&curri);
2750 /* Skip already visited or non-SSA operands (from iterating
2751 over PHI args). */
2752 while (curr != NULL_USE_OPERAND_P
2753 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2754 || ! bitmap_set_bit (visited,
2755 SSA_NAME_VERSION
2756 (USE_FROM_PTR (curr)))));
2758 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2759 if (curr == NULL_USE_OPERAND_P)
2760 break;
2762 else
2764 if (gimple_code (def) == GIMPLE_PHI)
2765 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2766 else
2767 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2768 while (curr != NULL_USE_OPERAND_P
2769 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2770 || ! bitmap_set_bit (visited,
2771 SSA_NAME_VERSION
2772 (USE_FROM_PTR (curr)))))
2773 curr = op_iter_next_use (&curri);
2774 if (curr == NULL_USE_OPERAND_P)
2775 goto pop;
2778 while (1);
2779 if (dump_file && (dump_flags & TDF_DETAILS))
2781 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2782 unsigned i;
2783 std::pair<ssa_op_iter, use_operand_p> *x;
2784 FOR_EACH_VEC_ELT (path, i, x)
2785 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2786 dump_printf (MSG_NOTE, "\n");
2789 /* Check whether the reduction path detected is valid. */
2790 bool fail = path.length () == 0;
2791 bool neg = false;
2792 for (unsigned i = 1; i < path.length (); ++i)
2794 gimple *use_stmt = USE_STMT (path[i].second);
2795 tree op = USE_FROM_PTR (path[i].second);
2796 if (! has_single_use (op)
2797 || ! is_gimple_assign (use_stmt))
2799 fail = true;
2800 break;
2802 if (gimple_assign_rhs_code (use_stmt) != code)
2804 if (code == PLUS_EXPR
2805 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2807 /* Track whether we negate the reduction value each iteration. */
2808 if (gimple_assign_rhs2 (use_stmt) == op)
2809 neg = ! neg;
2811 else
2813 fail = true;
2814 break;
2818 return ! fail && ! neg;
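/* Illustrative example (hypothetical SSA names): for CODE == PLUS_EXPR and
   the cycle

     s_1 = PHI <s_0, s_3>
     s_2 = s_1 - a[i];
     s_3 = s_2 + b[i];

   the path s_1 -> s_2 -> s_3 is accepted: the MINUS_EXPR is allowed
   because the accumulated value is its first operand, so it behaves like
   adding -a[i].  A statement of the form s_2 = a[i] - s_1, in contrast,
   negates the accumulated value every iteration and causes the path to be
   rejected via NEG above.  */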
2822 /* Function vect_is_simple_reduction
2824 (1) Detect a cross-iteration def-use cycle that represents a simple
2825 reduction computation. We look for the following pattern:
2827 loop_header:
2828 a1 = phi < a0, a2 >
2829 a3 = ...
2830 a2 = operation (a3, a1)
   or

2834 a3 = ...
2835 loop_header:
2836 a1 = phi < a0, a2 >
2837 a2 = operation (a3, a1)
2839 such that:
2840 1. operation is commutative and associative and it is safe to
2841 change the order of the computation
2842 2. no uses for a2 in the loop (a2 is used out of the loop)
2843 3. no uses of a1 in the loop besides the reduction operation
2844 4. no uses of a1 outside the loop.
2846 Conditions 1,4 are tested here.
2847 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2849 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2850 nested cycles.
2852 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2853 reductions:
2855 a1 = phi < a0, a2 >
2856 inner loop (def of a3)
2857 a2 = phi < a3 >
2859 (4) Detect condition expressions, i.e.:
2860 for (int i = 0; i < N; i++)
2861 if (a[i] < val)
2862 ret_val = a[i];
2866 static stmt_vec_info
2867 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2868 bool *double_reduc,
2869 bool need_wrapping_integral_overflow,
2870 enum vect_reduction_type *v_reduc_type)
2872 gphi *phi = as_a <gphi *> (phi_info->stmt);
2873 struct loop *loop = (gimple_bb (phi))->loop_father;
2874 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2875 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2876 gimple *phi_use_stmt = NULL;
2877 enum tree_code orig_code, code;
2878 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2879 tree type;
2880 tree name;
2881 imm_use_iterator imm_iter;
2882 use_operand_p use_p;
2883 bool phi_def;
2885 *double_reduc = false;
2886 *v_reduc_type = TREE_CODE_REDUCTION;
2888 tree phi_name = PHI_RESULT (phi);
2889 /* ??? If there are no uses of the PHI result the inner loop reduction
2890 won't be detected as possibly double-reduction by vectorizable_reduction
2891 because that tries to walk the PHI arg from the preheader edge which
2892 can be constant. See PR60382. */
2893 if (has_zero_uses (phi_name))
2894 return NULL;
2895 unsigned nphi_def_loop_uses = 0;
2896 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2898 gimple *use_stmt = USE_STMT (use_p);
2899 if (is_gimple_debug (use_stmt))
2900 continue;
2902 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2904 if (dump_enabled_p ())
2905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2906 "intermediate value used outside loop.\n");
2908 return NULL;
2911 nphi_def_loop_uses++;
2912 phi_use_stmt = use_stmt;
2915 edge latch_e = loop_latch_edge (loop);
2916 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2917 if (TREE_CODE (loop_arg) != SSA_NAME)
2919 if (dump_enabled_p ())
2920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2921 "reduction: not ssa_name: %T\n", loop_arg);
2922 return NULL;
2925 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2926 if (!def_stmt_info
2927 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2928 return NULL;
2930 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2932 name = gimple_assign_lhs (def_stmt);
2933 phi_def = false;
2935 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2937 name = PHI_RESULT (def_stmt);
2938 phi_def = true;
2940 else
2942 if (dump_enabled_p ())
2943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2944 "reduction: unhandled reduction operation: %G",
2945 def_stmt_info->stmt);
2946 return NULL;
2949 unsigned nlatch_def_loop_uses = 0;
2950 auto_vec<gphi *, 3> lcphis;
2951 bool inner_loop_of_double_reduc = false;
2952 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2954 gimple *use_stmt = USE_STMT (use_p);
2955 if (is_gimple_debug (use_stmt))
2956 continue;
2957 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2958 nlatch_def_loop_uses++;
2959 else
2961 /* We can have more than one loop-closed PHI. */
2962 lcphis.safe_push (as_a <gphi *> (use_stmt));
2963 if (nested_in_vect_loop
2964 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2965 == vect_double_reduction_def))
2966 inner_loop_of_double_reduc = true;
2970 /* If this isn't a nested cycle or if the nested cycle reduction value
2971 is used outside of the inner loop, we cannot handle uses of the reduction
2972 value. */
2973 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2974 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2976 if (dump_enabled_p ())
2977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2978 "reduction used in loop.\n");
2979 return NULL;
2982 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2983 defined in the inner loop. */
2984 if (phi_def)
2986 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2987 op1 = PHI_ARG_DEF (def_stmt, 0);
2989 if (gimple_phi_num_args (def_stmt) != 1
2990 || TREE_CODE (op1) != SSA_NAME)
2992 if (dump_enabled_p ())
2993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2994 "unsupported phi node definition.\n");
2996 return NULL;
2999 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3000 if (gimple_bb (def1)
3001 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3002 && loop->inner
3003 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3004 && is_gimple_assign (def1)
3005 && is_a <gphi *> (phi_use_stmt)
3006 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3008 if (dump_enabled_p ())
3009 report_vect_op (MSG_NOTE, def_stmt,
3010 "detected double reduction: ");
3012 *double_reduc = true;
3013 return def_stmt_info;
3016 return NULL;
3019 /* If we are vectorizing an inner reduction we are executing that
3020 in the original order only in case we are not dealing with a
3021 double reduction. */
3022 bool check_reduction = true;
3023 if (flow_loop_nested_p (vect_loop, loop))
3025 gphi *lcphi;
3026 unsigned i;
3027 check_reduction = false;
3028 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3029 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3031 gimple *use_stmt = USE_STMT (use_p);
3032 if (is_gimple_debug (use_stmt))
3033 continue;
3034 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3035 check_reduction = true;
3039 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3040 code = orig_code = gimple_assign_rhs_code (def_stmt);
3042 if (nested_in_vect_loop && !check_reduction)
3044 /* FIXME: Even for non-reductions code generation is funneled
3045 through vectorizable_reduction for the stmt defining the
3046 PHI latch value. So we have to artificially restrict ourselves
3047 for the supported operations. */
3048 switch (get_gimple_rhs_class (code))
3050 case GIMPLE_BINARY_RHS:
3051 case GIMPLE_TERNARY_RHS:
3052 break;
3053 default:
3054 /* Not supported by vectorizable_reduction. */
3055 if (dump_enabled_p ())
3056 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3057 "nested cycle: not handled operation: ");
3058 return NULL;
3060 if (dump_enabled_p ())
3061 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3062 return def_stmt_info;
3065 /* We can handle "res -= x[i]", which is non-associative, by
3066 simply rewriting this into "res += -x[i]". Avoid changing
3067 gimple instruction for the first simple tests and only do this
3068 if we're allowed to change code at all. */
3069 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3070 code = PLUS_EXPR;
3072 if (code == COND_EXPR)
3074 if (! nested_in_vect_loop)
3075 *v_reduc_type = COND_REDUCTION;
3077 op3 = gimple_assign_rhs1 (def_stmt);
3078 if (COMPARISON_CLASS_P (op3))
3080 op4 = TREE_OPERAND (op3, 1);
3081 op3 = TREE_OPERAND (op3, 0);
3083 if (op3 == phi_name || op4 == phi_name)
3085 if (dump_enabled_p ())
3086 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3087 "reduction: condition depends on previous"
3088 " iteration: ");
3089 return NULL;
3092 op1 = gimple_assign_rhs2 (def_stmt);
3093 op2 = gimple_assign_rhs3 (def_stmt);
3095 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3097 if (dump_enabled_p ())
3098 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3099 "reduction: not commutative/associative: ");
3100 return NULL;
3102 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3104 op1 = gimple_assign_rhs1 (def_stmt);
3105 op2 = gimple_assign_rhs2 (def_stmt);
3107 else
3109 if (dump_enabled_p ())
3110 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3111 "reduction: not handled operation: ");
3112 return NULL;
3115 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3117 if (dump_enabled_p ())
3118 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3119 "reduction: both uses not ssa_names: ");
3121 return NULL;
3124 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3125 if ((TREE_CODE (op1) == SSA_NAME
3126 && !types_compatible_p (type,TREE_TYPE (op1)))
3127 || (TREE_CODE (op2) == SSA_NAME
3128 && !types_compatible_p (type, TREE_TYPE (op2)))
3129 || (op3 && TREE_CODE (op3) == SSA_NAME
3130 && !types_compatible_p (type, TREE_TYPE (op3)))
3131 || (op4 && TREE_CODE (op4) == SSA_NAME
3132 && !types_compatible_p (type, TREE_TYPE (op4))))
3134 if (dump_enabled_p ())
3136 dump_printf_loc (MSG_NOTE, vect_location,
3137 "reduction: multiple types: operation type: "
3138 "%T, operands types: %T,%T",
3139 type, TREE_TYPE (op1), TREE_TYPE (op2));
3140 if (op3)
3141 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3143 if (op4)
3144 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3145 dump_printf (MSG_NOTE, "\n");
3148 return NULL;
3151 /* Check whether it's ok to change the order of the computation.
3152 Generally, when vectorizing a reduction we change the order of the
3153 computation. This may change the behavior of the program in some
3154 cases, so we need to check that this is ok. One exception is when
3155 vectorizing an outer-loop: the inner-loop is executed sequentially,
3156 and therefore vectorizing reductions in the inner-loop during
3157 outer-loop vectorization is safe. */
3158 if (check_reduction
3159 && *v_reduc_type == TREE_CODE_REDUCTION
3160 && needs_fold_left_reduction_p (type, code,
3161 need_wrapping_integral_overflow))
3162 *v_reduc_type = FOLD_LEFT_REDUCTION;
3164 /* Reduction is safe. We're dealing with one of the following:
3165 1) integer arithmetic and no trapv
3166 2) floating point arithmetic, and special flags permit this optimization
3167 3) nested cycle (i.e., outer loop vectorization). */
3168 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3169 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3170 if (code != COND_EXPR && !def1_info && !def2_info)
3172 if (dump_enabled_p ())
3173 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3174 return NULL;
3177 /* Check that one def is the reduction def, defined by PHI,
3178 the other def is either defined in the loop ("vect_internal_def"),
3179 or it's an induction (defined by a loop-header phi-node). */
3181 if (def2_info
3182 && def2_info->stmt == phi
3183 && (code == COND_EXPR
3184 || !def1_info
3185 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3186 || vect_valid_reduction_input_p (def1_info)))
3188 if (dump_enabled_p ())
3189 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3190 return def_stmt_info;
3193 if (def1_info
3194 && def1_info->stmt == phi
3195 && (code == COND_EXPR
3196 || !def2_info
3197 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3198 || vect_valid_reduction_input_p (def2_info)))
3200 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3202 /* Check if we can swap operands (just for simplicity - so that
3203 the rest of the code can assume that the reduction variable
3204 is always the last (second) argument). */
3205 if (code == COND_EXPR)
3207 /* Swap cond_expr by inverting the condition. */
3208 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3209 enum tree_code invert_code = ERROR_MARK;
3210 enum tree_code cond_code = TREE_CODE (cond_expr);
3212 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3214 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3215 invert_code = invert_tree_comparison (cond_code, honor_nans);
3217 if (invert_code != ERROR_MARK)
3219 TREE_SET_CODE (cond_expr, invert_code);
3220 swap_ssa_operands (def_stmt,
3221 gimple_assign_rhs2_ptr (def_stmt),
3222 gimple_assign_rhs3_ptr (def_stmt));
3224 else
3226 if (dump_enabled_p ())
3227 report_vect_op (MSG_NOTE, def_stmt,
3228 "detected reduction: cannot swap operands "
3229 "for cond_expr");
3230 return NULL;
3233 else
3234 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3235 gimple_assign_rhs2_ptr (def_stmt));
3237 if (dump_enabled_p ())
3238 report_vect_op (MSG_NOTE, def_stmt,
3239 "detected reduction: need to swap operands: ");
3241 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3242 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3244 else
3246 if (dump_enabled_p ())
3247 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3250 return def_stmt_info;
3253 /* Try to find SLP reduction chain. */
3254 if (! nested_in_vect_loop
3255 && code != COND_EXPR
3256 && orig_code != MINUS_EXPR
3257 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3259 if (dump_enabled_p ())
3260 report_vect_op (MSG_NOTE, def_stmt,
3261 "reduction: detected reduction chain: ");
3263 return def_stmt_info;
3266 /* Look for the expression computing loop_arg from loop PHI result. */
3267 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3268 return def_stmt_info;
3270 if (dump_enabled_p ())
3272 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3273 "reduction: unknown pattern: ");
3276 return NULL;
3279 /* Wrapper around vect_is_simple_reduction, which will modify code
3280 in-place if it enables detection of more reductions. Arguments
3281 as there. */
3283 stmt_vec_info
3284 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3285 bool *double_reduc,
3286 bool need_wrapping_integral_overflow)
3288 enum vect_reduction_type v_reduc_type;
3289 stmt_vec_info def_info
3290 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3291 need_wrapping_integral_overflow,
3292 &v_reduc_type);
3293 if (def_info)
3295 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3296 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3297 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3298 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3300 return def_info;
3303 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3304 int
3305 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3306 int *peel_iters_epilogue,
3307 stmt_vector_for_cost *scalar_cost_vec,
3308 stmt_vector_for_cost *prologue_cost_vec,
3309 stmt_vector_for_cost *epilogue_cost_vec)
3311 int retval = 0;
3312 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3314 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3316 *peel_iters_epilogue = assumed_vf / 2;
3317 if (dump_enabled_p ())
3318 dump_printf_loc (MSG_NOTE, vect_location,
3319 "cost model: epilogue peel iters set to vf/2 "
3320 "because loop iterations are unknown .\n");
3322 /* If peeled iterations are known but number of scalar loop
3323 iterations is unknown, count a taken branch per peeled loop. */
3324 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3325 NULL, 0, vect_prologue);
3326 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3327 NULL, 0, vect_epilogue);
3329 else
3331 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3332 peel_iters_prologue = niters < peel_iters_prologue ?
3333 niters : peel_iters_prologue;
3334 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3335 /* If we need to peel for gaps but the epilogue peel count came out
3336 to zero, we have to peel a full VF iterations instead. */
3337 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3338 *peel_iters_epilogue = assumed_vf;
3341 stmt_info_for_cost *si;
3342 int j;
3343 if (peel_iters_prologue)
3344 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3345 retval += record_stmt_cost (prologue_cost_vec,
3346 si->count * peel_iters_prologue,
3347 si->kind, si->stmt_info, si->misalign,
3348 vect_prologue);
3349 if (*peel_iters_epilogue)
3350 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3351 retval += record_stmt_cost (epilogue_cost_vec,
3352 si->count * *peel_iters_epilogue,
3353 si->kind, si->stmt_info, si->misalign,
3354 vect_epilogue);
3356 return retval;
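/* Illustrative example: with 100 known scalar iterations, an assumed VF of
   4 and 3 prologue iterations peeled for alignment, the epilogue gets
   (100 - 3) % 4 = 1 iteration; if peeling for gaps were also required and
   that remainder had been 0, a full VF of 4 epilogue iterations would be
   assumed instead.  Each peeled iteration is costed at the scalar
   single-iteration statement cost.  */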
3359 /* Function vect_estimate_min_profitable_iters
3361 Return the number of iterations required for the vector version of the
3362 loop to be profitable relative to the cost of the scalar version of the
3363 loop.
3365 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3366 of iterations for vectorization. A value of -1 means loop vectorization
3367 is not profitable. This returned value may be used for a dynamic
3368 profitability check.
3370 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3371 for static check against estimated number of iterations. */
3373 static void
3374 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3375 int *ret_min_profitable_niters,
3376 int *ret_min_profitable_estimate)
3378 int min_profitable_iters;
3379 int min_profitable_estimate;
3380 int peel_iters_prologue;
3381 int peel_iters_epilogue;
3382 unsigned vec_inside_cost = 0;
3383 int vec_outside_cost = 0;
3384 unsigned vec_prologue_cost = 0;
3385 unsigned vec_epilogue_cost = 0;
3386 int scalar_single_iter_cost = 0;
3387 int scalar_outside_cost = 0;
3388 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3389 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3390 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3392 /* Cost model disabled. */
3393 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3395 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3397 *ret_min_profitable_niters = 0;
3398 *ret_min_profitable_estimate = 0;
3399 return;
3402 /* Requires loop versioning tests to handle misalignment. */
3403 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3405 /* FIXME: Make cost depend on complexity of individual check. */
3406 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3407 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3408 vect_prologue);
3409 if (dump_enabled_p ())
3410 dump_printf (MSG_NOTE,
3411 "cost model: Adding cost of checks for loop "
3412 "versioning to treat misalignment.\n");
3415 /* Requires loop versioning with alias checks. */
3416 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3418 /* FIXME: Make cost depend on complexity of individual check. */
3419 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3420 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3421 vect_prologue);
3422 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3423 if (len)
3424 /* Count LEN - 1 ANDs and LEN comparisons. */
3425 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3426 NULL, 0, vect_prologue);
3427 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3428 if (len)
3430 /* Count LEN - 1 ANDs and LEN comparisons. */
3431 unsigned int nstmts = len * 2 - 1;
3432 /* +1 for each bias that needs adding. */
3433 for (unsigned int i = 0; i < len; ++i)
3434 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3435 nstmts += 1;
3436 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3437 NULL, 0, vect_prologue);
3439 if (dump_enabled_p ())
3440 dump_printf (MSG_NOTE,
3441 "cost model: Adding cost of checks for loop "
3442 "versioning aliasing.\n");
3445 /* Requires loop versioning with niter checks. */
3446 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3448 /* FIXME: Make cost depend on complexity of individual check. */
3449 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3450 vect_prologue);
3451 if (dump_enabled_p ())
3452 dump_printf (MSG_NOTE,
3453 "cost model: Adding cost of checks for loop "
3454 "versioning niters.\n");
3457 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3458 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3459 vect_prologue);
3461 /* Count statements in scalar loop. Using this as scalar cost for a single
3462 iteration for now.
3464 TODO: Add outer loop support.
3466 TODO: Consider assigning different costs to different scalar
3467 statements. */
3469 scalar_single_iter_cost
3470 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3472 /* Add additional cost for the peeled instructions in prologue and epilogue
3473 loop. (For fully-masked loops there will be no peeling.)
3475 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3476 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3478 TODO: Build an expression that represents peel_iters for prologue and
3479 epilogue to be used in a run-time test. */
3481 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3483 peel_iters_prologue = 0;
3484 peel_iters_epilogue = 0;
3486 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3488 /* We need to peel exactly one iteration. */
3489 peel_iters_epilogue += 1;
3490 stmt_info_for_cost *si;
3491 int j;
3492 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3493 j, si)
3494 (void) add_stmt_cost (target_cost_data, si->count,
3495 si->kind, si->stmt_info, si->misalign,
3496 vect_epilogue);
3499 else if (npeel < 0)
3501 peel_iters_prologue = assumed_vf / 2;
3502 if (dump_enabled_p ())
3503 dump_printf (MSG_NOTE, "cost model: "
3504 "prologue peel iters set to vf/2.\n");
3506 /* If peeling for alignment is unknown, the loop bound of the main
3507 loop becomes unknown. */
3508 peel_iters_epilogue = assumed_vf / 2;
3509 if (dump_enabled_p ())
3510 dump_printf (MSG_NOTE, "cost model: "
3511 "epilogue peel iters set to vf/2 because "
3512 "peeling for alignment is unknown.\n");
3514 /* If peeled iterations are unknown, count a taken branch and a not taken
3515 branch per peeled loop. Even if scalar loop iterations are known,
3516 vector iterations are not known since peeled prologue iterations are
3517 not known. Hence guards remain the same. */
3518 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3519 NULL, 0, vect_prologue);
3520 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3521 NULL, 0, vect_prologue);
3522 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3523 NULL, 0, vect_epilogue);
3524 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3525 NULL, 0, vect_epilogue);
3526 stmt_info_for_cost *si;
3527 int j;
3528 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3530 (void) add_stmt_cost (target_cost_data,
3531 si->count * peel_iters_prologue,
3532 si->kind, si->stmt_info, si->misalign,
3533 vect_prologue);
3534 (void) add_stmt_cost (target_cost_data,
3535 si->count * peel_iters_epilogue,
3536 si->kind, si->stmt_info, si->misalign,
3537 vect_epilogue);
3540 else
3542 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3543 stmt_info_for_cost *si;
3544 int j;
3545 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3547 prologue_cost_vec.create (2);
3548 epilogue_cost_vec.create (2);
3549 peel_iters_prologue = npeel;
3551 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3552 &peel_iters_epilogue,
3553 &LOOP_VINFO_SCALAR_ITERATION_COST
3554 (loop_vinfo),
3555 &prologue_cost_vec,
3556 &epilogue_cost_vec);
3558 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3559 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3560 si->misalign, vect_prologue);
3562 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3563 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3564 si->misalign, vect_epilogue);
3566 prologue_cost_vec.release ();
3567 epilogue_cost_vec.release ();
3570 /* FORNOW: The scalar outside cost is incremented in one of the
3571 following ways:
3573 1. The vectorizer checks for alignment and aliasing and generates
3574 a condition that allows dynamic vectorization. A cost model
3575 check is ANDED with the versioning condition. Hence scalar code
3576 path now has the added cost of the versioning check.
3578 if (cost > th & versioning_check)
3579 jmp to vector code
3581 Hence run-time scalar is incremented by not-taken branch cost.
3583 2. The vectorizer then checks if a prologue is required. If the
3584 cost model check was not done before during versioning, it has to
3585 be done before the prologue check.
3587 if (cost <= th)
3588 prologue = scalar_iters
3589 if (prologue == 0)
3590 jmp to vector code
3591 else
3592 execute prologue
3593 if (prologue == num_iters)
3594 go to exit
3596 Hence the run-time scalar cost is incremented by a taken branch,
3597 plus a not-taken branch, plus a taken branch cost.
3599 3. The vectorizer then checks if an epilogue is required. If the
3600 cost model check was not done before during prologue check, it
3601 has to be done with the epilogue check.
3603 if (prologue == 0)
3604 jmp to vector code
3605 else
3606 execute prologue
3607 if (prologue == num_iters)
3608 go to exit
3609 vector code:
3610 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3611 jmp to epilogue
3613 Hence the run-time scalar cost should be incremented by 2 taken
3614 branches.
3616 TODO: The back end may reorder the BBs differently and reverse
3617 conditions/branch directions. Change the estimates below to
3618 something more reasonable. */
3620 /* If the number of iterations is known and we do not do versioning, we can
3621 decide whether to vectorize at compile time. Hence the scalar version
3622 does not carry cost model guard costs. */
3623 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3624 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3626 /* Cost model check occurs at versioning. */
3627 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3628 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3629 else
3631 /* Cost model check occurs at prologue generation. */
3632 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3633 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3634 + vect_get_stmt_cost (cond_branch_not_taken);
3635 /* Cost model check occurs at epilogue generation. */
3636 else
3637 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3641 /* Complete the target-specific cost calculations. */
3642 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3643 &vec_inside_cost, &vec_epilogue_cost);
3645 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3647 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3650 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3651 vec_inside_cost);
3652 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3653 vec_prologue_cost);
3654 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3655 vec_epilogue_cost);
3656 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3657 scalar_single_iter_cost);
3658 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3659 scalar_outside_cost);
3660 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3661 vec_outside_cost);
3662 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3663 peel_iters_prologue);
3664 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3665 peel_iters_epilogue);
3668 /* Calculate number of iterations required to make the vector version
3669 profitable, relative to the loop bodies only. The following condition
3670 must hold true:
3671 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3672 where
3673 SIC = scalar iteration cost, VIC = vector iteration cost,
3674 VOC = vector outside cost, VF = vectorization factor,
3675 NPEEL = prologue iterations + epilogue iterations,
3676 SOC = scalar outside cost for run time cost model check. */
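/* Worked example with invented costs, only to illustrate the inequality:
   SIC = 4, VIC = 6, VF = 4, NPEEL = 0, VOC = 20, SOC = 6. Ignoring the
   rounding in (niters - NPEEL) / VF, the condition becomes
   4 * niters + 6 > 1.5 * niters + 20, i.e. niters > 5.6, so the vector
   version starts to pay off around six scalar iterations. The code below
   computes the exact threshold in integer arithmetic. */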
3678 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3679 - vec_inside_cost);
3680 if (saving_per_viter <= 0)
3682 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3683 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3684 "vectorization did not happen for a simd loop");
3686 if (dump_enabled_p ())
3687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3688 "cost model: the vector iteration cost = %d "
3689 "divided by the scalar iteration cost = %d "
3690 "is greater or equal to the vectorization factor = %d"
3691 ".\n",
3692 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3693 *ret_min_profitable_niters = -1;
3694 *ret_min_profitable_estimate = -1;
3695 return;
3698 /* ??? The "if" arm is written to handle all cases; see below for what
3699 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3700 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3702 /* Rewriting the condition above in terms of the number of
3703 vector iterations (vniters) rather than the number of
3704 scalar iterations (niters) gives:
3706 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3708 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3710 For integer N, X and Y when X > 0:
3712 N * X > Y <==> N >= (Y /[floor] X) + 1. */
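/* For instance (illustration only): Y = 10, X = 3 gives
   N >= 10 / 3 + 1 = 4; indeed 4 * 3 = 12 > 10 while 3 * 3 = 9 is not. */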
3713 int outside_overhead = (vec_outside_cost
3714 - scalar_single_iter_cost * peel_iters_prologue
3715 - scalar_single_iter_cost * peel_iters_epilogue
3716 - scalar_outside_cost);
3717 /* We're only interested in cases that require at least one
3718 vector iteration. */
3719 int min_vec_niters = 1;
3720 if (outside_overhead > 0)
3721 min_vec_niters = outside_overhead / saving_per_viter + 1;
3723 if (dump_enabled_p ())
3724 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3725 min_vec_niters);
3727 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3729 /* Now that we know the minimum number of vector iterations,
3730 find the minimum niters for which the scalar cost is larger:
3732 SIC * niters > VIC * vniters + VOC - SOC
3734 We know that the minimum niters is no more than
3735 vniters * VF + NPEEL, but it might be (and often is) less
3736 than that if a partial vector iteration is cheaper than the
3737 equivalent scalar code. */
3738 int threshold = (vec_inside_cost * min_vec_niters
3739 + vec_outside_cost
3740 - scalar_outside_cost);
3741 if (threshold <= 0)
3742 min_profitable_iters = 1;
3743 else
3744 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3746 else
3747 /* Convert the number of vector iterations into a number of
3748 scalar iterations. */
3749 min_profitable_iters = (min_vec_niters * assumed_vf
3750 + peel_iters_prologue
3751 + peel_iters_epilogue);
3753 else
3755 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3756 * assumed_vf
3757 - vec_inside_cost * peel_iters_prologue
3758 - vec_inside_cost * peel_iters_epilogue);
3759 if (min_profitable_iters <= 0)
3760 min_profitable_iters = 0;
3761 else
3763 min_profitable_iters /= saving_per_viter;
3765 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3766 <= (((int) vec_inside_cost * min_profitable_iters)
3767 + (((int) vec_outside_cost - scalar_outside_cost)
3768 * assumed_vf)))
3769 min_profitable_iters++;
3773 if (dump_enabled_p ())
3774 dump_printf (MSG_NOTE,
3775 " Calculated minimum iters for profitability: %d\n",
3776 min_profitable_iters);
3778 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3779 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3780 /* We want the vectorized loop to execute at least once. */
3781 min_profitable_iters = assumed_vf + peel_iters_prologue;
3783 if (dump_enabled_p ())
3784 dump_printf_loc (MSG_NOTE, vect_location,
3785 " Runtime profitability threshold = %d\n",
3786 min_profitable_iters);
3788 *ret_min_profitable_niters = min_profitable_iters;
3790 /* Calculate number of iterations required to make the vector version
3791 profitable, relative to the loop bodies only.
3793 The non-vectorized variant costs SIC * niters; for vectorization to be
3794 profitable the vector variant must beat it at the expected trip count:
3795 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
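/* With the same invented numbers as in the runtime example above
   (SIC = 4, VIC = 6, VF = 4, NPEEL = 0, VOC = 20, SOC = 6), the only
   difference is that SOC is added rather than subtracted:
   4 * niters > 1.5 * niters + 26, i.e. roughly niters >= 11. */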
3797 if (vec_outside_cost <= 0)
3798 min_profitable_estimate = 0;
3799 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3801 /* This is a repeat of the code above, but with + SOC rather
3802 than - SOC. */
3803 int outside_overhead = (vec_outside_cost
3804 - scalar_single_iter_cost * peel_iters_prologue
3805 - scalar_single_iter_cost * peel_iters_epilogue
3806 + scalar_outside_cost);
3807 int min_vec_niters = 1;
3808 if (outside_overhead > 0)
3809 min_vec_niters = outside_overhead / saving_per_viter + 1;
3811 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3813 int threshold = (vec_inside_cost * min_vec_niters
3814 + vec_outside_cost
3815 + scalar_outside_cost);
3816 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3818 else
3819 min_profitable_estimate = (min_vec_niters * assumed_vf
3820 + peel_iters_prologue
3821 + peel_iters_epilogue);
3823 else
3825 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3826 * assumed_vf
3827 - vec_inside_cost * peel_iters_prologue
3828 - vec_inside_cost * peel_iters_epilogue)
3829 / ((scalar_single_iter_cost * assumed_vf)
3830 - vec_inside_cost);
3832 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3833 if (dump_enabled_p ())
3834 dump_printf_loc (MSG_NOTE, vect_location,
3835 " Static estimate profitability threshold = %d\n",
3836 min_profitable_estimate);
3838 *ret_min_profitable_estimate = min_profitable_estimate;
3841 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3842 vector elements (not bits) for a vector with NELT elements. */
3843 static void
3844 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3845 vec_perm_builder *sel)
3847 /* The encoding is a single stepped pattern. Any wrap-around is handled
3848 by vec_perm_indices. */
3849 sel->new_vector (nelt, 1, 3);
3850 for (unsigned int i = 0; i < 3; i++)
3851 sel->quick_push (i + offset);
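/* Illustration with made-up values: for OFFSET == 2 and NELT == 8 the
   three encoded elements are {2, 3, 4}; vec_perm_indices extends the
   stepped pattern to the full selector {2, 3, 4, 5, 6, 7, 8, 9}, where
   indices 8 and 9 select from the second vec_perm input. */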
3854 /* Checks whether the target supports whole-vector shifts for vectors of mode
3855 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3856 it supports vec_perm_const with masks for all necessary shift amounts. */
3857 static bool
3858 have_whole_vector_shift (machine_mode mode)
3860 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3861 return true;
3863 /* Variable-length vectors should be handled via the optab. */
3864 unsigned int nelt;
3865 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3866 return false;
3868 vec_perm_builder sel;
3869 vec_perm_indices indices;
3870 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3872 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3873 indices.new_vector (sel, 2, nelt);
3874 if (!can_vec_perm_const_p (mode, indices, false))
3875 return false;
3877 return true;
3880 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3881 functions. Design better to avoid maintenance issues. */
3883 /* Function vect_model_reduction_cost.
3885 Models cost for a reduction operation, including the vector ops
3886 generated within the strip-mine loop, the initial definition before
3887 the loop, and the epilogue code that must be generated. */
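/* As a rough illustration of what gets recorded (assumed scenario, not
   taken from the sources): a plain sum reduction with NCOPIES == 2, a
   direct reduc_fn and no outer loop results in 1 scalar_to_vec in the
   prologue, 2 vector_stmt in the body, and 1 vector_stmt plus
   1 vec_to_scalar in the epilogue. */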
3889 static void
3890 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3891 int ncopies, stmt_vector_for_cost *cost_vec)
3893 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3894 enum tree_code code;
3895 optab optab;
3896 tree vectype;
3897 machine_mode mode;
3898 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3899 struct loop *loop = NULL;
3901 if (loop_vinfo)
3902 loop = LOOP_VINFO_LOOP (loop_vinfo);
3904 /* Condition reductions generate two reductions in the loop. */
3905 vect_reduction_type reduction_type
3906 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3907 if (reduction_type == COND_REDUCTION)
3908 ncopies *= 2;
3910 vectype = STMT_VINFO_VECTYPE (stmt_info);
3911 mode = TYPE_MODE (vectype);
3912 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3914 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3916 if (reduction_type == EXTRACT_LAST_REDUCTION
3917 || reduction_type == FOLD_LEFT_REDUCTION)
3919 /* No extra instructions needed in the prologue. */
3920 prologue_cost = 0;
3922 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3923 /* Count one reduction-like operation per vector. */
3924 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3925 stmt_info, 0, vect_body);
3926 else
3928 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3929 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3930 inside_cost = record_stmt_cost (cost_vec, nelements,
3931 vec_to_scalar, stmt_info, 0,
3932 vect_body);
3933 inside_cost += record_stmt_cost (cost_vec, nelements,
3934 scalar_stmt, stmt_info, 0,
3935 vect_body);
3938 else
3940 /* Add in cost for initial definition.
3941 For cond reduction we have four vectors: initial index, step,
3942 initial result of the data reduction, initial value of the index
3943 reduction. */
3944 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3945 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3946 scalar_to_vec, stmt_info, 0,
3947 vect_prologue);
3949 /* Cost of reduction op inside loop. */
3950 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3951 stmt_info, 0, vect_body);
3954 /* Determine cost of epilogue code.
3956 We have a reduction operator that will reduce the vector in one statement.
3957 Also requires scalar extract. */
3959 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3961 if (reduc_fn != IFN_LAST)
3963 if (reduction_type == COND_REDUCTION)
3965 /* An EQ stmt and a COND_EXPR stmt. */
3966 epilogue_cost += record_stmt_cost (cost_vec, 2,
3967 vector_stmt, stmt_info, 0,
3968 vect_epilogue);
3969 /* Reduction of the max index and a reduction of the found
3970 values. */
3971 epilogue_cost += record_stmt_cost (cost_vec, 2,
3972 vec_to_scalar, stmt_info, 0,
3973 vect_epilogue);
3974 /* A broadcast of the max value. */
3975 epilogue_cost += record_stmt_cost (cost_vec, 1,
3976 scalar_to_vec, stmt_info, 0,
3977 vect_epilogue);
3979 else
3981 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3982 stmt_info, 0, vect_epilogue);
3983 epilogue_cost += record_stmt_cost (cost_vec, 1,
3984 vec_to_scalar, stmt_info, 0,
3985 vect_epilogue);
3988 else if (reduction_type == COND_REDUCTION)
3990 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3991 /* Extraction of scalar elements. */
3992 epilogue_cost += record_stmt_cost (cost_vec,
3993 2 * estimated_nunits,
3994 vec_to_scalar, stmt_info, 0,
3995 vect_epilogue);
3996 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3997 epilogue_cost += record_stmt_cost (cost_vec,
3998 2 * estimated_nunits - 3,
3999 scalar_stmt, stmt_info, 0,
4000 vect_epilogue);
4002 else if (reduction_type == EXTRACT_LAST_REDUCTION
4003 || reduction_type == FOLD_LEFT_REDUCTION)
4004 /* No extra instructions needed in the epilogue. */
4006 else
4008 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4009 tree bitsize =
4010 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4011 int element_bitsize = tree_to_uhwi (bitsize);
4012 int nelements = vec_size_in_bits / element_bitsize;
4014 if (code == COND_EXPR)
4015 code = MAX_EXPR;
4017 optab = optab_for_tree_code (code, vectype, optab_default);
4019 /* We have a whole vector shift available. */
4020 if (optab != unknown_optab
4021 && VECTOR_MODE_P (mode)
4022 && optab_handler (optab, mode) != CODE_FOR_nothing
4023 && have_whole_vector_shift (mode))
4025 /* Final reduction via vector shifts and the reduction operator.
4026 Also requires scalar extract. */
4027 epilogue_cost += record_stmt_cost (cost_vec,
4028 exact_log2 (nelements) * 2,
4029 vector_stmt, stmt_info, 0,
4030 vect_epilogue);
4031 epilogue_cost += record_stmt_cost (cost_vec, 1,
4032 vec_to_scalar, stmt_info, 0,
4033 vect_epilogue);
4035 else
4036 /* Use extracts and reduction op for final reduction. For N
4037 elements, we have N extracts and N-1 reduction ops. */
4038 epilogue_cost += record_stmt_cost (cost_vec,
4039 nelements + nelements - 1,
4040 vector_stmt, stmt_info, 0,
4041 vect_epilogue);
4045 if (dump_enabled_p ())
4046 dump_printf (MSG_NOTE,
4047 "vect_model_reduction_cost: inside_cost = %d, "
4048 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4049 prologue_cost, epilogue_cost);
4053 /* Function vect_model_induction_cost.
4055 Models cost for induction operations. */
4057 static void
4058 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4059 stmt_vector_for_cost *cost_vec)
4061 unsigned inside_cost, prologue_cost;
4063 if (PURE_SLP_STMT (stmt_info))
4064 return;
4066 /* loop cost for vec_loop. */
4067 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4068 stmt_info, 0, vect_body);
4070 /* prologue cost for vec_init and vec_step. */
4071 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4072 stmt_info, 0, vect_prologue);
4074 if (dump_enabled_p ())
4075 dump_printf_loc (MSG_NOTE, vect_location,
4076 "vect_model_induction_cost: inside_cost = %d, "
4077 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4082 /* Function get_initial_def_for_reduction
4084 Input:
4085 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4086 INIT_VAL - the initial value of the reduction variable
4088 Output:
4089 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4090 of the reduction (used for adjusting the epilog - see below).
4091 Return a vector variable, initialized according to the operation that
4092 STMT_VINFO performs. This vector will be used as the initial value
4093 of the vector of partial results.
4095 Option1 (adjust in epilog): Initialize the vector as follows:
4096 add/bit or/xor: [0,0,...,0,0]
4097 mult/bit and: [1,1,...,1,1]
4098 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4099 and when necessary (e.g. add/mult case) let the caller know
4100 that it needs to adjust the result by init_val.
4102 Option2: Initialize the vector as follows:
4103 add/bit or/xor: [init_val,0,0,...,0]
4104 mult/bit and: [init_val,1,1,...,1]
4105 min/max/cond_expr: [init_val,init_val,...,init_val]
4106 and no adjustments are needed.
4108 For example, for the following code:
4110 s = init_val;
4111 for (i=0;i<n;i++)
4112 s = s + a[i];
4114 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4115 For a vector of 4 units, we want to return either [0,0,0,init_val],
4116 or [0,0,0,0] and let the caller know that it needs to adjust
4117 the result at the end by 'init_val'.
4119 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4120 is not NULL, because this way the initialization vector is simpler (the
4121 same element in all entries), and Option2 otherwise.
4123 A cost model should help decide between these two schemes. */
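/* Illustrative example (values invented): summing with init_val = 5 and
   a 4-element vector. Option1 starts the partial sums at [0,0,0,0] and
   the caller later adds 5 to the reduced result via ADJUSTMENT_DEF;
   Option2 starts at [5,0,0,0] and needs no adjustment. Both give the
   same final value. */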
4125 tree
4126 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4127 tree *adjustment_def)
4129 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4130 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4131 tree scalar_type = TREE_TYPE (init_val);
4132 tree vectype = get_vectype_for_scalar_type (scalar_type);
4133 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4134 tree def_for_init;
4135 tree init_def;
4136 REAL_VALUE_TYPE real_init_val = dconst0;
4137 int int_init_val = 0;
4138 gimple_seq stmts = NULL;
4140 gcc_assert (vectype);
4142 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4143 || SCALAR_FLOAT_TYPE_P (scalar_type));
4145 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4146 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4148 vect_reduction_type reduction_type
4149 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4151 switch (code)
4153 case WIDEN_SUM_EXPR:
4154 case DOT_PROD_EXPR:
4155 case SAD_EXPR:
4156 case PLUS_EXPR:
4157 case MINUS_EXPR:
4158 case BIT_IOR_EXPR:
4159 case BIT_XOR_EXPR:
4160 case MULT_EXPR:
4161 case BIT_AND_EXPR:
4163 /* ADJUSTMENT_DEF is NULL when called from
4164 vect_create_epilog_for_reduction to vectorize double reduction. */
4165 if (adjustment_def)
4166 *adjustment_def = init_val;
4168 if (code == MULT_EXPR)
4170 real_init_val = dconst1;
4171 int_init_val = 1;
4174 if (code == BIT_AND_EXPR)
4175 int_init_val = -1;
4177 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4178 def_for_init = build_real (scalar_type, real_init_val);
4179 else
4180 def_for_init = build_int_cst (scalar_type, int_init_val);
4182 if (adjustment_def)
4183 /* Option1: the first element is '0' or '1' as well. */
4184 init_def = gimple_build_vector_from_val (&stmts, vectype,
4185 def_for_init);
4186 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4188 /* Option2 (variable length): the first element is INIT_VAL. */
4189 init_def = gimple_build_vector_from_val (&stmts, vectype,
4190 def_for_init);
4191 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4192 vectype, init_def, init_val);
4194 else
4196 /* Option2: the first element is INIT_VAL. */
4197 tree_vector_builder elts (vectype, 1, 2);
4198 elts.quick_push (init_val);
4199 elts.quick_push (def_for_init);
4200 init_def = gimple_build_vector (&stmts, &elts);
4203 break;
4205 case MIN_EXPR:
4206 case MAX_EXPR:
4207 case COND_EXPR:
4209 if (adjustment_def)
4211 *adjustment_def = NULL_TREE;
4212 if (reduction_type != COND_REDUCTION
4213 && reduction_type != EXTRACT_LAST_REDUCTION)
4215 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4216 break;
4219 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4220 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4222 break;
4224 default:
4225 gcc_unreachable ();
4228 if (stmts)
4229 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4230 return init_def;
4233 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4234 NUMBER_OF_VECTORS is the number of vector defs to create.
4235 If NEUTRAL_OP is nonnull, introducing extra elements of that
4236 value will not change the result. */
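/* Small hypothetical case: two independent sum reductions a and b
   vectorized together with a 4-element vector type and NEUTRAL_OP == 0.
   The initial vector built below is {a_init, b_init, 0, 0}: the first
   GROUP_SIZE lanes take the PHI initial values, the remaining lanes the
   neutral value. */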
4238 static void
4239 get_initial_defs_for_reduction (slp_tree slp_node,
4240 vec<tree> *vec_oprnds,
4241 unsigned int number_of_vectors,
4242 bool reduc_chain, tree neutral_op)
4244 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4245 stmt_vec_info stmt_vinfo = stmts[0];
4246 unsigned HOST_WIDE_INT nunits;
4247 unsigned j, number_of_places_left_in_vector;
4248 tree vector_type;
4249 unsigned int group_size = stmts.length ();
4250 unsigned int i;
4251 struct loop *loop;
4253 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4255 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4257 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4258 gcc_assert (loop);
4259 edge pe = loop_preheader_edge (loop);
4261 gcc_assert (!reduc_chain || neutral_op);
4263 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4264 created vectors. It is greater than 1 if unrolling is performed.
4266 For example, we have two scalar operands, s1 and s2 (e.g., group of
4267 strided accesses of size two), while NUNITS is four (i.e., four scalars
4268 of this type can be packed in a vector). The output vector will contain
4269 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4270 will be 2).
4272 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4273 vectors containing the operands.
4275 For example, NUNITS is four as before, and the group size is 8
4276 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4277 {s5, s6, s7, s8}. */
4279 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4280 nunits = group_size;
4282 number_of_places_left_in_vector = nunits;
4283 bool constant_p = true;
4284 tree_vector_builder elts (vector_type, nunits, 1);
4285 elts.quick_grow (nunits);
4286 gimple_seq ctor_seq = NULL;
4287 for (j = 0; j < nunits * number_of_vectors; ++j)
4289 tree op;
4290 i = j % group_size;
4291 stmt_vinfo = stmts[i];
4293 /* Get the def before the loop. In a reduction chain we have only one
4294 initial value; otherwise we have as many as there are PHIs in the group. */
4295 if (reduc_chain)
4296 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4297 else if (((vec_oprnds->length () + 1) * nunits
4298 - number_of_places_left_in_vector >= group_size)
4299 && neutral_op)
4300 op = neutral_op;
4301 else
4302 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4304 /* Create 'vect_ = {op0,op1,...,opn}'. */
4305 number_of_places_left_in_vector--;
4306 elts[nunits - number_of_places_left_in_vector - 1] = op;
4307 if (!CONSTANT_CLASS_P (op))
4308 constant_p = false;
4310 if (number_of_places_left_in_vector == 0)
4312 tree init;
4313 if (constant_p && !neutral_op
4314 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4315 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4316 /* Build the vector directly from ELTS. */
4317 init = gimple_build_vector (&ctor_seq, &elts);
4318 else if (neutral_op)
4320 /* Build a vector of the neutral value and shift the
4321 other elements into place. */
4322 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4323 neutral_op);
4324 int k = nunits;
4325 while (k > 0 && elts[k - 1] == neutral_op)
4326 k -= 1;
4327 while (k > 0)
4329 k -= 1;
4330 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4331 vector_type, init, elts[k]);
4334 else
4336 /* First time round, duplicate ELTS to fill the
4337 required number of vectors. */
4338 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4339 number_of_vectors, *vec_oprnds);
4340 break;
4342 vec_oprnds->quick_push (init);
4344 number_of_places_left_in_vector = nunits;
4345 elts.new_vector (vector_type, nunits, 1);
4346 elts.quick_grow (nunits);
4347 constant_p = true;
4350 if (ctor_seq != NULL)
4351 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4355 /* Function vect_create_epilog_for_reduction
4357 Create code at the loop-epilog to finalize the result of a reduction
4358 computation.
4360 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4361 reduction statements.
4362 STMT_INFO is the scalar reduction stmt that is being vectorized.
4363 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4364 number of elements that we can fit in a vectype (nunits). In this case
4365 we have to generate more than one vector stmt - i.e - we need to "unroll"
4366 the vector stmt by a factor VF/nunits. For more details see documentation
4367 in vectorizable_operation.
4368 REDUC_FN is the internal function for the epilog reduction.
4369 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4370 computation.
4371 REDUC_INDEX is the index of the operand in the right hand side of the
4372 statement that is defined by REDUCTION_PHI.
4373 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4374 SLP_NODE is an SLP node containing a group of reduction statements. The
4375 first one in this group is STMT_INFO.
4376 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4377 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4378 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4379 any value of the IV in the loop.
4380 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4381 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4382 null if this is not an SLP reduction.
4384 This function:
4385 1. Creates the reduction def-use cycles: sets the arguments for
4386 REDUCTION_PHIS:
4387 The loop-entry argument is the vectorized initial-value of the reduction.
4388 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4389 sums.
4390 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4391 by calling the function specified by REDUC_FN if available, or by
4392 other means (whole-vector shifts or a scalar loop).
4393 The function also creates a new phi node at the loop exit to preserve
4394 loop-closed form, as illustrated below.
4396 The flow at the entry to this function:
4398 loop:
4399 vec_def = phi <null, null> # REDUCTION_PHI
4400 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4401 s_loop = scalar_stmt # (scalar) STMT_INFO
4402 loop_exit:
4403 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4404 use <s_out0>
4405 use <s_out0>
4407 The above is transformed by this function into:
4409 loop:
4410 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4411 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4412 s_loop = scalar_stmt # (scalar) STMT_INFO
4413 loop_exit:
4414 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4415 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4416 v_out2 = reduce <v_out1>
4417 s_out3 = extract_field <v_out2, 0>
4418 s_out4 = adjust_result <s_out3>
4419 use <s_out4>
4420 use <s_out4> */
4423 static void
4424 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4425 stmt_vec_info stmt_info,
4426 gimple *reduc_def_stmt,
4427 int ncopies, internal_fn reduc_fn,
4428 vec<stmt_vec_info> reduction_phis,
4429 bool double_reduc,
4430 slp_tree slp_node,
4431 slp_instance slp_node_instance,
4432 tree induc_val, enum tree_code induc_code,
4433 tree neutral_op)
4435 stmt_vec_info prev_phi_info;
4436 tree vectype;
4437 machine_mode mode;
4438 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4439 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4440 basic_block exit_bb;
4441 tree scalar_dest;
4442 tree scalar_type;
4443 gimple *new_phi = NULL, *phi;
4444 stmt_vec_info phi_info;
4445 gimple_stmt_iterator exit_gsi;
4446 tree vec_dest;
4447 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4448 gimple *epilog_stmt = NULL;
4449 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4450 gimple *exit_phi;
4451 tree bitsize;
4452 tree adjustment_def = NULL;
4453 tree vec_initial_def = NULL;
4454 tree expr, def, initial_def = NULL;
4455 tree orig_name, scalar_result;
4456 imm_use_iterator imm_iter, phi_imm_iter;
4457 use_operand_p use_p, phi_use_p;
4458 gimple *use_stmt;
4459 stmt_vec_info reduction_phi_info = NULL;
4460 bool nested_in_vect_loop = false;
4461 auto_vec<gimple *> new_phis;
4462 auto_vec<stmt_vec_info> inner_phis;
4463 int j, i;
4464 auto_vec<tree> scalar_results;
4465 unsigned int group_size = 1, k, ratio;
4466 auto_vec<tree> vec_initial_defs;
4467 auto_vec<gimple *> phis;
4468 bool slp_reduc = false;
4469 bool direct_slp_reduc;
4470 tree new_phi_result;
4471 stmt_vec_info inner_phi = NULL;
4472 tree induction_index = NULL_TREE;
4474 if (slp_node)
4475 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4477 if (nested_in_vect_loop_p (loop, stmt_info))
4479 outer_loop = loop;
4480 loop = loop->inner;
4481 nested_in_vect_loop = true;
4482 gcc_assert (!slp_node);
4485 vectype = STMT_VINFO_VECTYPE (stmt_info);
4486 gcc_assert (vectype);
4487 mode = TYPE_MODE (vectype);
4489 /* 1. Create the reduction def-use cycle:
4490 Set the arguments of REDUCTION_PHIS, i.e., transform
4492 loop:
4493 vec_def = phi <null, null> # REDUCTION_PHI
4494 VECT_DEF = vector_stmt # vectorized form of STMT
4497 into:
4499 loop:
4500 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4501 VECT_DEF = vector_stmt # vectorized form of STMT
4504 (in case of SLP, do it for all the phis). */
4506 /* Get the loop-entry arguments. */
4507 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4508 if (slp_node)
4510 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4511 vec_initial_defs.reserve (vec_num);
4512 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4513 &vec_initial_defs, vec_num,
4514 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4515 neutral_op);
4517 else
4519 /* Get at the scalar def before the loop, that defines the initial value
4520 of the reduction variable. */
4521 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4522 loop_preheader_edge (loop));
4523 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4524 and we can't use zero for induc_val, use initial_def. Similarly
4525 for REDUC_MIN and initial_def larger than the base. */
4526 if (TREE_CODE (initial_def) == INTEGER_CST
4527 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4528 == INTEGER_INDUC_COND_REDUCTION)
4529 && !integer_zerop (induc_val)
4530 && ((induc_code == MAX_EXPR
4531 && tree_int_cst_lt (initial_def, induc_val))
4532 || (induc_code == MIN_EXPR
4533 && tree_int_cst_lt (induc_val, initial_def))))
4534 induc_val = initial_def;
4536 if (double_reduc)
4537 /* In case of double reduction we only create a vector variable
4538 to be put in the reduction phi node. The actual statement
4539 creation is done later in this function. */
4540 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4541 else if (nested_in_vect_loop)
4543 /* Do not use an adjustment def as that case is not supported
4544 correctly if ncopies is not one. */
4545 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4546 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4547 stmt_info);
4549 else
4550 vec_initial_def
4551 = get_initial_def_for_reduction (stmt_info, initial_def,
4552 &adjustment_def);
4553 vec_initial_defs.create (1);
4554 vec_initial_defs.quick_push (vec_initial_def);
4557 /* Set phi nodes arguments. */
4558 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4560 tree vec_init_def = vec_initial_defs[i];
4561 tree def = vect_defs[i];
4562 for (j = 0; j < ncopies; j++)
4564 if (j != 0)
4566 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4567 if (nested_in_vect_loop)
4568 vec_init_def
4569 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4572 /* Set the loop-entry arg of the reduction-phi. */
4574 gphi *phi = as_a <gphi *> (phi_info->stmt);
4575 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4576 == INTEGER_INDUC_COND_REDUCTION)
4578 /* Initialise the reduction phi to zero. This prevents initial
4579 non-zero values from interfering with the reduction op. */
4580 gcc_assert (ncopies == 1);
4581 gcc_assert (i == 0);
4583 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4584 tree induc_val_vec
4585 = build_vector_from_val (vec_init_def_type, induc_val);
4587 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4588 UNKNOWN_LOCATION);
4590 else
4591 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4592 UNKNOWN_LOCATION);
4594 /* Set the loop-latch arg for the reduction-phi. */
4595 if (j > 0)
4596 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4598 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4600 if (dump_enabled_p ())
4601 dump_printf_loc (MSG_NOTE, vect_location,
4602 "transform reduction: created def-use cycle: %G%G",
4603 phi, SSA_NAME_DEF_STMT (def));
4607 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4608 which is updated with the current index of the loop for every match of
4609 the original loop's cond_expr (VEC_STMT). This results in a vector
4610 containing the last time the condition passed for that vector lane.
4611 The first match will be a 1 to allow 0 to be used for non-matching
4612 indexes. If there are no matches at all then the vector will be all
4613 zeroes. */
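/* Hypothetical illustration: with four lanes and two vector iterations
   (scalar iterations numbered 1..8 across the lanes), each lane records
   the number of the last scalar iteration in which its condition held,
   so the index vector might end up as {5, 0, 7, 4}, lane 1 having never
   matched. */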
4614 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4616 tree indx_before_incr, indx_after_incr;
4617 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4619 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4620 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4622 int scalar_precision
4623 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4624 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4625 tree cr_index_vector_type = build_vector_type
4626 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4628 /* First we create a simple vector induction variable which starts
4629 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4630 vector size (STEP). */
4632 /* Create a {1,2,3,...} vector. */
4633 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4635 /* Create a vector of the step value. */
4636 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4637 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4639 /* Create an induction variable. */
4640 gimple_stmt_iterator incr_gsi;
4641 bool insert_after;
4642 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4643 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4644 insert_after, &indx_before_incr, &indx_after_incr);
4646 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4647 filled with zeros (VEC_ZERO). */
4649 /* Create a vector of 0s. */
4650 tree zero = build_zero_cst (cr_index_scalar_type);
4651 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4653 /* Create a vector phi node. */
4654 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4655 new_phi = create_phi_node (new_phi_tree, loop->header);
4656 loop_vinfo->add_stmt (new_phi);
4657 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4658 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4660 /* Now take the condition from the loop's original cond_expr
4661 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4662 every match uses values from the induction variable
4663 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4664 (NEW_PHI_TREE).
4665 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4666 the new cond_expr (INDEX_COND_EXPR). */
4668 /* Duplicate the condition from vec_stmt. */
4669 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4671 /* Create a conditional, where the condition is taken from vec_stmt
4672 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4673 and the "else" value is the phi (NEW_PHI_TREE). */
4674 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4675 ccompare, indx_before_incr,
4676 new_phi_tree);
4677 induction_index = make_ssa_name (cr_index_vector_type);
4678 gimple *index_condition = gimple_build_assign (induction_index,
4679 index_cond_expr);
4680 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4681 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4682 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4684 /* Update the phi with the vec cond. */
4685 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4686 loop_latch_edge (loop), UNKNOWN_LOCATION);
4689 /* 2. Create epilog code.
4690 The reduction epilog code operates across the elements of the vector
4691 of partial results computed by the vectorized loop.
4692 The reduction epilog code consists of:
4694 step 1: compute the scalar result in a vector (v_out2)
4695 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4696 step 3: adjust the scalar result (s_out3) if needed.
4698 Step 1 can be accomplished using one of the following three schemes:
4699 (scheme 1) using reduc_fn, if available.
4700 (scheme 2) using whole-vector shifts, if available.
4701 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4702 combined.
4704 The overall epilog code looks like this:
4706 s_out0 = phi <s_loop> # original EXIT_PHI
4707 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4708 v_out2 = reduce <v_out1> # step 1
4709 s_out3 = extract_field <v_out2, 0> # step 2
4710 s_out4 = adjust_result <s_out3> # step 3
4712 (step 3 is optional, and steps 1 and 2 may be combined).
4713 Lastly, the uses of s_out0 are replaced by s_out4. */
4716 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4717 v_out1 = phi <VECT_DEF>
4718 Store them in NEW_PHIS. */
4720 exit_bb = single_exit (loop)->dest;
4721 prev_phi_info = NULL;
4722 new_phis.create (vect_defs.length ());
4723 FOR_EACH_VEC_ELT (vect_defs, i, def)
4725 for (j = 0; j < ncopies; j++)
4727 tree new_def = copy_ssa_name (def);
4728 phi = create_phi_node (new_def, exit_bb);
4729 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4730 if (j == 0)
4731 new_phis.quick_push (phi);
4732 else
4734 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4735 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4738 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4739 prev_phi_info = phi_info;
4743 /* The epilogue is created for the outer-loop, i.e., for the loop being
4744 vectorized. Create exit phis for the outer loop. */
4745 if (double_reduc)
4747 loop = outer_loop;
4748 exit_bb = single_exit (loop)->dest;
4749 inner_phis.create (vect_defs.length ());
4750 FOR_EACH_VEC_ELT (new_phis, i, phi)
4752 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4753 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4754 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4755 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4756 PHI_RESULT (phi));
4757 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4758 inner_phis.quick_push (phi_info);
4759 new_phis[i] = outer_phi;
4760 while (STMT_VINFO_RELATED_STMT (phi_info))
4762 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4763 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4764 outer_phi = create_phi_node (new_result, exit_bb);
4765 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4766 PHI_RESULT (phi_info->stmt));
4767 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4768 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4769 prev_phi_info = outer_phi_info;
4774 exit_gsi = gsi_after_labels (exit_bb);
4776 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4777 (i.e. when reduc_fn is not available) and in the final adjustment
4778 code (if needed). Also get the original scalar reduction variable as
4779 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4780 represents a reduction pattern), the tree-code and scalar-def are
4781 taken from the original stmt that the pattern-stmt (STMT) replaces.
4782 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4783 are taken from STMT. */
4785 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4786 if (orig_stmt_info != stmt_info)
4788 /* Reduction pattern */
4789 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4790 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4793 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4794 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4795 partial results are added and not subtracted. */
4796 if (code == MINUS_EXPR)
4797 code = PLUS_EXPR;
4799 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4800 scalar_type = TREE_TYPE (scalar_dest);
4801 scalar_results.create (group_size);
4802 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4803 bitsize = TYPE_SIZE (scalar_type);
4805 /* In case this is a reduction in an inner-loop while vectorizing an outer
4806 loop - we don't need to extract a single scalar result at the end of the
4807 inner-loop (unless it is double reduction, i.e., the use of reduction is
4808 outside the outer-loop). The final vector of partial results will be used
4809 in the vectorized outer-loop, or reduced to a scalar result at the end of
4810 the outer-loop. */
4811 if (nested_in_vect_loop && !double_reduc)
4812 goto vect_finalize_reduction;
4814 /* SLP reduction without reduction chain, e.g.,
4815 # a1 = phi <a2, a0>
4816 # b1 = phi <b2, b0>
4817 a2 = operation (a1)
4818 b2 = operation (b1) */
4819 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4821 /* True if we should implement SLP_REDUC using native reduction operations
4822 instead of scalar operations. */
4823 direct_slp_reduc = (reduc_fn != IFN_LAST
4824 && slp_reduc
4825 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4827 /* In case of reduction chain, e.g.,
4828 # a1 = phi <a3, a0>
4829 a2 = operation (a1)
4830 a3 = operation (a2),
4832 we may end up with more than one vector result. Here we reduce them to
4833 one vector. */
4834 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4836 tree first_vect = PHI_RESULT (new_phis[0]);
4837 gassign *new_vec_stmt = NULL;
4838 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4839 for (k = 1; k < new_phis.length (); k++)
4841 gimple *next_phi = new_phis[k];
4842 tree second_vect = PHI_RESULT (next_phi);
4843 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4844 new_vec_stmt = gimple_build_assign (tem, code,
4845 first_vect, second_vect);
4846 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4847 first_vect = tem;
4850 new_phi_result = first_vect;
4851 if (new_vec_stmt)
4853 new_phis.truncate (0);
4854 new_phis.safe_push (new_vec_stmt);
4857 /* Likewise if we couldn't use a single defuse cycle. */
4858 else if (ncopies > 1)
4860 gcc_assert (new_phis.length () == 1);
4861 tree first_vect = PHI_RESULT (new_phis[0]);
4862 gassign *new_vec_stmt = NULL;
4863 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4864 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4865 for (int k = 1; k < ncopies; ++k)
4867 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4868 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4869 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4870 new_vec_stmt = gimple_build_assign (tem, code,
4871 first_vect, second_vect);
4872 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4873 first_vect = tem;
4875 new_phi_result = first_vect;
4876 new_phis.truncate (0);
4877 new_phis.safe_push (new_vec_stmt);
4879 else
4880 new_phi_result = PHI_RESULT (new_phis[0]);
4882 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4883 && reduc_fn != IFN_LAST)
4885 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4886 various data values where the condition matched and another vector
4887 (INDUCTION_INDEX) containing all the indexes of those matches. We
4888 need to extract the last matching index (which will be the index with
4889 highest value) and use this to index into the data vector.
4890 For the case where there were no matches, the data vector will contain
4891 all default values and the index vector will be all zeros. */
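/* Hypothetical illustration: with data vector {d0, d1, d2, d3} and index
   vector {5, 0, 7, 4}, IFN_REDUC_MAX over the indexes yields 7, the
   comparison selects only lane 2, and the final scalar result is d2. */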
4893 /* Get various versions of the type of the vector of indexes. */
4894 tree index_vec_type = TREE_TYPE (induction_index);
4895 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4896 tree index_scalar_type = TREE_TYPE (index_vec_type);
4897 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4898 (index_vec_type);
4900 /* Get an unsigned integer version of the type of the data vector. */
4901 int scalar_precision
4902 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4903 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4904 tree vectype_unsigned = build_vector_type
4905 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4907 /* First we need to create a vector (ZERO_VEC) of zeros and another
4908 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4909 can create using a MAX reduction and then expanding.
4910 In the case where the loop never made any matches, the max index will
4911 be zero. */
4913 /* Vector of {0, 0, 0,...}. */
4914 tree zero_vec = make_ssa_name (vectype);
4915 tree zero_vec_rhs = build_zero_cst (vectype);
4916 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4917 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4919 /* Find maximum value from the vector of found indexes. */
4920 tree max_index = make_ssa_name (index_scalar_type);
4921 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4922 1, induction_index);
4923 gimple_call_set_lhs (max_index_stmt, max_index);
4924 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4926 /* Vector of {max_index, max_index, max_index,...}. */
4927 tree max_index_vec = make_ssa_name (index_vec_type);
4928 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4929 max_index);
4930 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4931 max_index_vec_rhs);
4932 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4934 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4935 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4936 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4937 otherwise. Only one value should match, resulting in a vector
4938 (VEC_COND) with one data value and the rest zeros.
4939 In the case where the loop never made any matches, every index will
4940 match, resulting in a vector with all data values (which will all be
4941 the default value). */
4943 /* Compare the max index vector to the vector of found indexes to find
4944 the position of the max value. */
4945 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4946 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4947 induction_index,
4948 max_index_vec);
4949 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4951 /* Use the compare to choose either values from the data vector or
4952 zero. */
4953 tree vec_cond = make_ssa_name (vectype);
4954 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4955 vec_compare, new_phi_result,
4956 zero_vec);
4957 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4959 /* Finally we need to extract the data value from the vector (VEC_COND)
4960 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4961 reduction, but because this doesn't exist, we can use a MAX reduction
4962 instead. The data value might be signed or a float so we need to cast
4963 it first.
4964 In the case where the loop never made any matches, the data values are
4965 all identical, and so will reduce down correctly. */
4967 /* Make the matched data values unsigned. */
4968 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4969 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4970 vec_cond);
4971 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4972 VIEW_CONVERT_EXPR,
4973 vec_cond_cast_rhs);
4974 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4976 /* Reduce down to a scalar value. */
4977 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4978 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4979 1, vec_cond_cast);
4980 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4981 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4983 /* Convert the reduced value back to the result type and set as the
4984 result. */
4985 gimple_seq stmts = NULL;
4986 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4987 data_reduc);
4988 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4989 scalar_results.safe_push (new_temp);
4991 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4992 && reduc_fn == IFN_LAST)
4994 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4995 idx = 0;
4996 idx_val = induction_index[0];
4997 val = data_reduc[0];
4998 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4999 if (induction_index[i] > idx_val)
5000 val = data_reduc[i], idx_val = induction_index[i];
5001 return val; */
5003 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5004 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5005 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5006 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5007 /* Enforced by vectorizable_reduction, which ensures we have target
5008 support before allowing a conditional reduction on variable-length
5009 vectors. */
5010 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5011 tree idx_val = NULL_TREE, val = NULL_TREE;
5012 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5014 tree old_idx_val = idx_val;
5015 tree old_val = val;
5016 idx_val = make_ssa_name (idx_eltype);
5017 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5018 build3 (BIT_FIELD_REF, idx_eltype,
5019 induction_index,
5020 bitsize_int (el_size),
5021 bitsize_int (off)));
5022 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5023 val = make_ssa_name (data_eltype);
5024 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5025 build3 (BIT_FIELD_REF,
5026 data_eltype,
5027 new_phi_result,
5028 bitsize_int (el_size),
5029 bitsize_int (off)));
5030 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5031 if (off != 0)
5033 tree new_idx_val = idx_val;
5034 tree new_val = val;
5035 if (off != v_size - el_size)
5037 new_idx_val = make_ssa_name (idx_eltype);
5038 epilog_stmt = gimple_build_assign (new_idx_val,
5039 MAX_EXPR, idx_val,
5040 old_idx_val);
5041 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5043 new_val = make_ssa_name (data_eltype);
5044 epilog_stmt = gimple_build_assign (new_val,
5045 COND_EXPR,
5046 build2 (GT_EXPR,
5047 boolean_type_node,
5048 idx_val,
5049 old_idx_val),
5050 val, old_val);
5051 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5052 idx_val = new_idx_val;
5053 val = new_val;
5056 /* Convert the reduced value back to the result type and set as the
5057 result. */
5058 gimple_seq stmts = NULL;
5059 val = gimple_convert (&stmts, scalar_type, val);
5060 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5061 scalar_results.safe_push (val);
5064 /* 2.3 Create the reduction code, using one of the three schemes described
5065 above. In SLP we simply need to extract all the elements from the
5066 vector (without reducing them), so we use scalar shifts. */
5067 else if (reduc_fn != IFN_LAST && !slp_reduc)
5069 tree tmp;
5070 tree vec_elem_type;
5072 /* Case 1: Create:
5073 v_out2 = reduc_expr <v_out1> */
5075 if (dump_enabled_p ())
5076 dump_printf_loc (MSG_NOTE, vect_location,
5077 "Reduce using direct vector reduction.\n");
5079 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5080 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5082 tree tmp_dest
5083 = vect_create_destination_var (scalar_dest, vec_elem_type);
5084 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5085 new_phi_result);
5086 gimple_set_lhs (epilog_stmt, tmp_dest);
5087 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5088 gimple_set_lhs (epilog_stmt, new_temp);
5089 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5091 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5092 new_temp);
5094 else
5096 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5097 new_phi_result);
5098 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5101 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5102 gimple_set_lhs (epilog_stmt, new_temp);
5103 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5105 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5106 == INTEGER_INDUC_COND_REDUCTION)
5107 && !operand_equal_p (initial_def, induc_val, 0))
5109 /* Earlier we set the initial value to be a vector of induc_val
5110 values. Check the result and if it is induc_val then replace
5111 it with the original initial value, unless induc_val is
5112 the same as initial_def already. */
5113 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5114 induc_val);
5116 tmp = make_ssa_name (new_scalar_dest);
5117 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5118 initial_def, new_temp);
5119 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5120 new_temp = tmp;
5123 scalar_results.safe_push (new_temp);
5125 else if (direct_slp_reduc)
5127 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5128 with the elements for other SLP statements replaced with the
5129 neutral value. We can then do a normal reduction on each vector. */
5131 /* Enforced by vectorizable_reduction. */
5132 gcc_assert (new_phis.length () == 1);
5133 gcc_assert (pow2p_hwi (group_size));
5135 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5136 vec<stmt_vec_info> orig_phis
5137 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5138 gimple_seq seq = NULL;
5140 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5141 and the same element size as VECTYPE. */
5142 tree index = build_index_vector (vectype, 0, 1);
5143 tree index_type = TREE_TYPE (index);
5144 tree index_elt_type = TREE_TYPE (index_type);
5145 tree mask_type = build_same_sized_truth_vector_type (index_type);
5147 /* Create a vector that, for each element, identifies which of
5148 the REDUC_GROUP_SIZE results should use it. */
5149 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5150 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5151 build_vector_from_val (index_type, index_mask));
5153 /* Get a neutral vector value. This is simply a splat of the neutral
5154 scalar value if we have one, otherwise the initial scalar value
5155 is itself a neutral value. */
5156 tree vector_identity = NULL_TREE;
5157 if (neutral_op)
5158 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5159 neutral_op);
5160 for (unsigned int i = 0; i < group_size; ++i)
5162 /* If there's no universal neutral value, we can use the
5163 initial scalar value from the original PHI. This is used
5164 for MIN and MAX reduction, for example. */
5165 if (!neutral_op)
5167 tree scalar_value
5168 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5169 loop_preheader_edge (loop));
5170 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5171 scalar_value);
5174 /* Calculate the equivalent of:
5176 sel[j] = (index[j] == i);
5178 which selects the elements of NEW_PHI_RESULT that should
5179 be included in the result. */
5180 tree compare_val = build_int_cst (index_elt_type, i);
5181 compare_val = build_vector_from_val (index_type, compare_val);
5182 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5183 index, compare_val);
5185 /* Calculate the equivalent of:
5187 vec = seq ? new_phi_result : vector_identity;
5189 VEC is now suitable for a full vector reduction. */
5190 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5191 sel, new_phi_result, vector_identity);
5193 /* Do the reduction and convert it to the appropriate type. */
5194 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5195 TREE_TYPE (vectype), vec);
5196 scalar = gimple_convert (&seq, scalar_type, scalar);
5197 scalar_results.safe_push (scalar);
5199 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5201 else
5203 bool reduce_with_shift;
5204 tree vec_temp;
5206 /* COND reductions all do the final reduction with MAX_EXPR
5207 or MIN_EXPR. */
5208 if (code == COND_EXPR)
5210 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5211 == INTEGER_INDUC_COND_REDUCTION)
5212 code = induc_code;
5213 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5214 == CONST_COND_REDUCTION)
5215 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5216 else
5217 code = MAX_EXPR;
5220 /* See if the target wants to do the final (shift) reduction
5221 in a vector mode of smaller size and first reduce upper/lower
5222 halves against each other. */
5223 enum machine_mode mode1 = mode;
5224 tree vectype1 = vectype;
5225 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5226 unsigned sz1 = sz;
5227 if (!slp_reduc
5228 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5229 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5231 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5232 reduce_with_shift = have_whole_vector_shift (mode1);
5233 if (!VECTOR_MODE_P (mode1))
5234 reduce_with_shift = false;
5235 else
5237 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5238 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5239 reduce_with_shift = false;
5242 /* First reduce the vector to the vector size we should do the shift
5243 reduction on, by combining upper and lower halves. */
5244 new_temp = new_phi_result;
5245 while (sz > sz1)
5247 gcc_assert (!slp_reduc);
5248 sz /= 2;
5249 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5251 /* The target has to make sure we support lowpart/highpart
5252 extraction, either via direct vector extract or through
5253 integer mode punning. */
5254 tree dst1, dst2;
5255 if (convert_optab_handler (vec_extract_optab,
5256 TYPE_MODE (TREE_TYPE (new_temp)),
5257 TYPE_MODE (vectype1))
5258 != CODE_FOR_nothing)
5260 /* Extract sub-vectors directly once vec_extract becomes
5261 a conversion optab. */
5262 dst1 = make_ssa_name (vectype1);
5263 epilog_stmt
5264 = gimple_build_assign (dst1, BIT_FIELD_REF,
5265 build3 (BIT_FIELD_REF, vectype1,
5266 new_temp, TYPE_SIZE (vectype1),
5267 bitsize_int (0)));
5268 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5269 dst2 = make_ssa_name (vectype1);
5270 epilog_stmt
5271 = gimple_build_assign (dst2, BIT_FIELD_REF,
5272 build3 (BIT_FIELD_REF, vectype1,
5273 new_temp, TYPE_SIZE (vectype1),
5274 bitsize_int (sz * BITS_PER_UNIT)));
5275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5277 else
5279 /* Extract via punning to an appropriately sized integer mode
5280 vector. */
5281 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5283 tree etype = build_vector_type (eltype, 2);
5284 gcc_assert (convert_optab_handler (vec_extract_optab,
5285 TYPE_MODE (etype),
5286 TYPE_MODE (eltype))
5287 != CODE_FOR_nothing);
5288 tree tem = make_ssa_name (etype);
5289 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5290 build1 (VIEW_CONVERT_EXPR,
5291 etype, new_temp));
5292 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293 new_temp = tem;
5294 tem = make_ssa_name (eltype);
5295 epilog_stmt
5296 = gimple_build_assign (tem, BIT_FIELD_REF,
5297 build3 (BIT_FIELD_REF, eltype,
5298 new_temp, TYPE_SIZE (eltype),
5299 bitsize_int (0)));
5300 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5301 dst1 = make_ssa_name (vectype1);
5302 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5303 build1 (VIEW_CONVERT_EXPR,
5304 vectype1, tem));
5305 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5306 tem = make_ssa_name (eltype);
5307 epilog_stmt
5308 = gimple_build_assign (tem, BIT_FIELD_REF,
5309 build3 (BIT_FIELD_REF, eltype,
5310 new_temp, TYPE_SIZE (eltype),
5311 bitsize_int (sz * BITS_PER_UNIT)));
5312 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5313 dst2 = make_ssa_name (vectype1);
5314 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5315 build1 (VIEW_CONVERT_EXPR,
5316 vectype1, tem));
5317 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5320 new_temp = make_ssa_name (vectype1);
5321 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5322 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5325 if (reduce_with_shift && !slp_reduc)
5327 int element_bitsize = tree_to_uhwi (bitsize);
5328 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5329 for variable-length vectors and also requires direct target support
5330 for loop reductions. */
5331 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5332 int nelements = vec_size_in_bits / element_bitsize;
5333 vec_perm_builder sel;
5334 vec_perm_indices indices;
5336 int elt_offset;
5338 tree zero_vec = build_zero_cst (vectype1);
5339 /* Case 2: Create:
5340 for (offset = nelements/2; offset >= 1; offset/=2)
5342 Create: va' = vec_shift <va, offset>
5343 Create: va = vop <va, va'>
5344 } */
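/* Illustrative sketch (assumed V4SI accumulator { a, b, c, d } and a PLUS
   reduction):
     offset 2:  va' = { c, d, 0, 0 }      va = { a+c, b+d, ., . }
     offset 1:  va' = { b+d, ., 0, 0 }    va = { a+b+c+d, ., ., . }
   so the full sum ends up in element 0, which the BIT_FIELD_REF
   extraction below reads out.  */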
5346 tree rhs;
5348 if (dump_enabled_p ())
5349 dump_printf_loc (MSG_NOTE, vect_location,
5350 "Reduce using vector shifts\n");
5352 mode1 = TYPE_MODE (vectype1);
5353 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5354 for (elt_offset = nelements / 2;
5355 elt_offset >= 1;
5356 elt_offset /= 2)
5358 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5359 indices.new_vector (sel, 2, nelements);
5360 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5361 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5362 new_temp, zero_vec, mask);
5363 new_name = make_ssa_name (vec_dest, epilog_stmt);
5364 gimple_assign_set_lhs (epilog_stmt, new_name);
5365 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5367 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5368 new_temp);
5369 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5370 gimple_assign_set_lhs (epilog_stmt, new_temp);
5371 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5374 /* 2.4 Extract the final scalar result. Create:
5375 s_out3 = extract_field <v_out2, bitpos> */
5377 if (dump_enabled_p ())
5378 dump_printf_loc (MSG_NOTE, vect_location,
5379 "extract scalar result\n");
5381 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5382 bitsize, bitsize_zero_node);
5383 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5384 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5385 gimple_assign_set_lhs (epilog_stmt, new_temp);
5386 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5387 scalar_results.safe_push (new_temp);
5389 else
5391 /* Case 3: Create:
5392 s = extract_field <v_out2, 0>
5393 for (offset = element_size;
5394 offset < vector_size;
5395 offset += element_size)
5397 Create: s' = extract_field <v_out2, offset>
5398 Create: s = op <s, s'> // For non SLP cases
5399 } */
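/* Illustrative sketch (assumed V4SI accumulator { a, b, c, d }, PLUS
   reduction, no SLP):
     s  = BIT_FIELD_REF <v_out2, 32,  0>		(= a)
     s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s + s'
     s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s + s'
     s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s + s'
   leaving a+b+c+d in S.  With SLP the op is skipped and each extracted s'
   is pushed to SCALAR_RESULTS instead.  */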
5401 if (dump_enabled_p ())
5402 dump_printf_loc (MSG_NOTE, vect_location,
5403 "Reduce using scalar code.\n");
5405 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5406 int element_bitsize = tree_to_uhwi (bitsize);
5407 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5409 int bit_offset;
5410 if (gimple_code (new_phi) == GIMPLE_PHI)
5411 vec_temp = PHI_RESULT (new_phi);
5412 else
5413 vec_temp = gimple_assign_lhs (new_phi);
5414 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5415 bitsize_zero_node);
5416 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5417 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5418 gimple_assign_set_lhs (epilog_stmt, new_temp);
5419 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5421 /* In SLP we don't need to apply the reduction operation, so we just
5422 collect s' values in SCALAR_RESULTS. */
5423 if (slp_reduc)
5424 scalar_results.safe_push (new_temp);
5426 for (bit_offset = element_bitsize;
5427 bit_offset < vec_size_in_bits;
5428 bit_offset += element_bitsize)
5430 tree bitpos = bitsize_int (bit_offset);
5431 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5432 bitsize, bitpos);
5434 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5435 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5436 gimple_assign_set_lhs (epilog_stmt, new_name);
5437 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5439 if (slp_reduc)
5441 /* In SLP we don't need to apply the reduction operation, so
5442 we just collect s' values in SCALAR_RESULTS. */
5443 new_temp = new_name;
5444 scalar_results.safe_push (new_name);
5446 else
5448 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5449 new_name, new_temp);
5450 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5451 gimple_assign_set_lhs (epilog_stmt, new_temp);
5452 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457 /* The only case where we need to reduce scalar results in SLP, is
5458 unrolling. If the size of SCALAR_RESULTS is greater than
5459 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5460 REDUC_GROUP_SIZE. */
5461 if (slp_reduc)
5463 tree res, first_res, new_res;
5464 gimple *new_stmt;
5466 /* Reduce multiple scalar results in case of SLP unrolling. */
5467 for (j = group_size; scalar_results.iterate (j, &res);
5468 j++)
5470 first_res = scalar_results[j % group_size];
5471 new_stmt = gimple_build_assign (new_scalar_dest, code,
5472 first_res, res);
5473 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5474 gimple_assign_set_lhs (new_stmt, new_res);
5475 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5476 scalar_results[j % group_size] = new_res;
5479 else
5480 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5481 scalar_results.safe_push (new_temp);
5484 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5485 == INTEGER_INDUC_COND_REDUCTION)
5486 && !operand_equal_p (initial_def, induc_val, 0))
5488 /* Earlier we set the initial value to be a vector of induc_val
5489 values. Check the result and if it is induc_val then replace
5490 with the original initial value, unless induc_val is
5491 the same as initial_def already. */
5492 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5493 induc_val);
5495 tree tmp = make_ssa_name (new_scalar_dest);
5496 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5497 initial_def, new_temp);
5498 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5499 scalar_results[0] = tmp;
5503 vect_finalize_reduction:
5505 if (double_reduc)
5506 loop = loop->inner;
5508 /* 2.5 Adjust the final result by the initial value of the reduction
5509 variable. (When such adjustment is not needed, then
5510 'adjustment_def' is zero). For example, if code is PLUS we create:
5511 new_temp = loop_exit_def + adjustment_def */
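/* Illustrative sketch (assumed scenario, not from the source): for
       int sum = s0;  for (i = 0; i < n; i++)  sum += a[i];
   the vector accumulator may have been seeded with the neutral value
   { 0, ..., 0 }, in which case ADJUSTMENT_DEF carries s0 and the code
   below adds it back to the reduced result.  */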
5513 if (adjustment_def)
5515 gcc_assert (!slp_reduc);
5516 if (nested_in_vect_loop)
5518 new_phi = new_phis[0];
5519 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5520 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5521 new_dest = vect_create_destination_var (scalar_dest, vectype);
5523 else
5525 new_temp = scalar_results[0];
5526 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5527 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5528 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5531 epilog_stmt = gimple_build_assign (new_dest, expr);
5532 new_temp = make_ssa_name (new_dest, epilog_stmt);
5533 gimple_assign_set_lhs (epilog_stmt, new_temp);
5534 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5535 if (nested_in_vect_loop)
5537 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5538 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5539 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5541 if (!double_reduc)
5542 scalar_results.quick_push (new_temp);
5543 else
5544 scalar_results[0] = new_temp;
5546 else
5547 scalar_results[0] = new_temp;
5549 new_phis[0] = epilog_stmt;
5552 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5553 phis with new adjusted scalar results, i.e., replace use <s_out0>
5554 with use <s_out4>.
5556 Transform:
5557 loop_exit:
5558 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5559 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5560 v_out2 = reduce <v_out1>
5561 s_out3 = extract_field <v_out2, 0>
5562 s_out4 = adjust_result <s_out3>
5563 use <s_out0>
5564 use <s_out0>
5566 into:
5568 loop_exit:
5569 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5570 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5571 v_out2 = reduce <v_out1>
5572 s_out3 = extract_field <v_out2, 0>
5573 s_out4 = adjust_result <s_out3>
5574 use <s_out4>
5575 use <s_out4> */
5578 /* In an SLP reduction chain we reduce vector results into one vector if
5579 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5580 LHS of the last stmt in the reduction chain, since we are looking for
5581 the loop exit phi node. */
5582 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5584 stmt_vec_info dest_stmt_info
5585 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5586 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5587 group_size = 1;
5590 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS
5591 (when REDUC_GROUP_SIZE is greater than the vectorization factor).
5592 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5593 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5594 correspond to the first vector stmt, etc.
5595 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
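/* Illustrative sketch (assumed counts): with REDUC_GROUP_SIZE == 8 and
   new_phis.length () == 2, RATIO is 4, so scalar results 0-3 are matched
   with new_phis[0]/reduction_phis[0] and results 4-7 with index 1.  */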
5596 if (group_size > new_phis.length ())
5598 ratio = group_size / new_phis.length ();
5599 gcc_assert (!(group_size % new_phis.length ()));
5601 else
5602 ratio = 1;
5604 stmt_vec_info epilog_stmt_info = NULL;
5605 for (k = 0; k < group_size; k++)
5607 if (k % ratio == 0)
5609 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5610 reduction_phi_info = reduction_phis[k / ratio];
5611 if (double_reduc)
5612 inner_phi = inner_phis[k / ratio];
5615 if (slp_reduc)
5617 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5619 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5620 /* SLP statements can't participate in patterns. */
5621 gcc_assert (!orig_stmt_info);
5622 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5625 phis.create (3);
5626 /* Find the loop-closed-use at the loop exit of the original scalar
5627 result. (The reduction result is expected to have two immediate uses -
5628 one at the latch block, and one at the loop exit). */
5629 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5630 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5631 && !is_gimple_debug (USE_STMT (use_p)))
5632 phis.safe_push (USE_STMT (use_p));
5634 /* While we expect to have found an exit_phi because of loop-closed-ssa
5635 form we can end up without one if the scalar cycle is dead. */
5637 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5639 if (outer_loop)
5641 stmt_vec_info exit_phi_vinfo
5642 = loop_vinfo->lookup_stmt (exit_phi);
5643 gphi *vect_phi;
5645 if (double_reduc)
5646 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5647 else
5648 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5649 if (!double_reduc
5650 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5651 != vect_double_reduction_def)
5652 continue;
5654 /* Handle double reduction:
5656 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5657 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5658 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5659 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5661 At that point the regular reduction (stmt2 and stmt3) is
5662 already vectorized, as well as the exit phi node, stmt4.
5663 Here we vectorize the phi node of double reduction, stmt1, and
5664 update all relevant statements. */
5666 /* Go through all the uses of s2 to find double reduction phi
5667 node, i.e., stmt1 above. */
5668 orig_name = PHI_RESULT (exit_phi);
5669 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5671 stmt_vec_info use_stmt_vinfo;
5672 tree vect_phi_init, preheader_arg, vect_phi_res;
5673 basic_block bb = gimple_bb (use_stmt);
5675 /* Check that USE_STMT is really a double reduction phi
5676 node. */
5677 if (gimple_code (use_stmt) != GIMPLE_PHI
5678 || gimple_phi_num_args (use_stmt) != 2
5679 || bb->loop_father != outer_loop)
5680 continue;
5681 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5682 if (!use_stmt_vinfo
5683 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5684 != vect_double_reduction_def)
5685 continue;
5687 /* Create vector phi node for double reduction:
5688 vs1 = phi <vs0, vs2>
5689 vs1 was created previously in this function by a call to
5690 vect_get_vec_def_for_operand and is stored in
5691 vec_initial_def;
5692 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5693 vs0 is created here. */
5695 /* Create vector phi node. */
5696 vect_phi = create_phi_node (vec_initial_def, bb);
5697 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5699 /* Create vs0 - initial def of the double reduction phi. */
5700 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5701 loop_preheader_edge (outer_loop));
5702 vect_phi_init = get_initial_def_for_reduction
5703 (stmt_info, preheader_arg, NULL);
5705 /* Update phi node arguments with vs0 and vs2. */
5706 add_phi_arg (vect_phi, vect_phi_init,
5707 loop_preheader_edge (outer_loop),
5708 UNKNOWN_LOCATION);
5709 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5710 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_NOTE, vect_location,
5713 "created double reduction phi node: %G",
5714 vect_phi);
5716 vect_phi_res = PHI_RESULT (vect_phi);
5718 /* Replace the use, i.e., set the correct vs1 in the regular
5719 reduction phi node. FORNOW, NCOPIES is always 1, so the
5720 loop is redundant. */
5721 stmt_vec_info use_info = reduction_phi_info;
5722 for (j = 0; j < ncopies; j++)
5724 edge pr_edge = loop_preheader_edge (loop);
5725 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5726 pr_edge->dest_idx, vect_phi_res);
5727 use_info = STMT_VINFO_RELATED_STMT (use_info);
5733 phis.release ();
5734 if (nested_in_vect_loop)
5736 if (double_reduc)
5737 loop = outer_loop;
5738 else
5739 continue;
5742 phis.create (3);
5743 /* Find the loop-closed-use at the loop exit of the original scalar
5744 result. (The reduction result is expected to have two immediate uses,
5745 one at the latch block, and one at the loop exit). For double
5746 reductions we are looking for exit phis of the outer loop. */
5747 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5749 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5751 if (!is_gimple_debug (USE_STMT (use_p)))
5752 phis.safe_push (USE_STMT (use_p));
5754 else
5756 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5758 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5760 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5762 if (!flow_bb_inside_loop_p (loop,
5763 gimple_bb (USE_STMT (phi_use_p)))
5764 && !is_gimple_debug (USE_STMT (phi_use_p)))
5765 phis.safe_push (USE_STMT (phi_use_p));
5771 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5773 /* Replace the uses: */
5774 orig_name = PHI_RESULT (exit_phi);
5775 scalar_result = scalar_results[k];
5776 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5777 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5778 SET_USE (use_p, scalar_result);
5781 phis.release ();
5785 /* Return a vector of type VECTYPE that is equal to the vector select
5786 operation "MASK ? VEC : IDENTITY". Insert the select statements
5787 before GSI. */
5789 static tree
5790 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5791 tree vec, tree identity)
5793 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5794 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5795 mask, vec, identity);
5796 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5797 return cond;
5800 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5801 order, starting with LHS. Insert the extraction statements before GSI and
5802 associate the new scalar SSA names with variable SCALAR_DEST.
5803 Return the SSA name for the result. */
5805 static tree
5806 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5807 tree_code code, tree lhs, tree vector_rhs)
5809 tree vectype = TREE_TYPE (vector_rhs);
5810 tree scalar_type = TREE_TYPE (vectype);
5811 tree bitsize = TYPE_SIZE (scalar_type);
5812 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5813 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5815 for (unsigned HOST_WIDE_INT bit_offset = 0;
5816 bit_offset < vec_size_in_bits;
5817 bit_offset += element_bitsize)
5819 tree bitpos = bitsize_int (bit_offset);
5820 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5821 bitsize, bitpos);
5823 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5824 rhs = make_ssa_name (scalar_dest, stmt);
5825 gimple_assign_set_lhs (stmt, rhs);
5826 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5828 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5829 tree new_name = make_ssa_name (scalar_dest, stmt);
5830 gimple_assign_set_lhs (stmt, new_name);
5831 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5832 lhs = new_name;
5834 return lhs;
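/* Illustrative sketch (assumed V4SF VECTOR_RHS and PLUS_EXPR CODE) of what
   vect_expand_fold_left emits:
     s_1 = BIT_FIELD_REF <vector_rhs, 32,  0>;  lhs_1 = lhs   + s_1;
     s_2 = BIT_FIELD_REF <vector_rhs, 32, 32>;  lhs_2 = lhs_1 + s_2;
     s_3 = BIT_FIELD_REF <vector_rhs, 32, 64>;  lhs_3 = lhs_2 + s_3;
     s_4 = BIT_FIELD_REF <vector_rhs, 32, 96>;  lhs_4 = lhs_3 + s_4;
   returning lhs_4, which preserves the original left-to-right (in-order)
   association.  */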
5837 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5838 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5839 statement. CODE is the operation performed by STMT_INFO and OPS are
5840 its scalar operands. REDUC_INDEX is the index of the operand in
5841 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5842 implements in-order reduction, or IFN_LAST if we should open-code it.
5843 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5844 that should be used to control the operation in a fully-masked loop. */
5846 static bool
5847 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5848 gimple_stmt_iterator *gsi,
5849 stmt_vec_info *vec_stmt, slp_tree slp_node,
5850 gimple *reduc_def_stmt,
5851 tree_code code, internal_fn reduc_fn,
5852 tree ops[3], tree vectype_in,
5853 int reduc_index, vec_loop_masks *masks)
5855 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5856 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5857 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5858 stmt_vec_info new_stmt_info = NULL;
5860 int ncopies;
5861 if (slp_node)
5862 ncopies = 1;
5863 else
5864 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5866 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5867 gcc_assert (ncopies == 1);
5868 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5869 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5870 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5871 == FOLD_LEFT_REDUCTION);
5873 if (slp_node)
5874 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5875 TYPE_VECTOR_SUBPARTS (vectype_in)));
5877 tree op0 = ops[1 - reduc_index];
5879 int group_size = 1;
5880 stmt_vec_info scalar_dest_def_info;
5881 auto_vec<tree> vec_oprnds0;
5882 if (slp_node)
5884 auto_vec<vec<tree> > vec_defs (2);
5885 auto_vec<tree> sops(2);
5886 sops.quick_push (ops[0]);
5887 sops.quick_push (ops[1]);
5888 vect_get_slp_defs (sops, slp_node, &vec_defs);
5889 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5890 vec_defs[0].release ();
5891 vec_defs[1].release ();
5892 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5893 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5895 else
5897 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5898 vec_oprnds0.create (1);
5899 vec_oprnds0.quick_push (loop_vec_def0);
5900 scalar_dest_def_info = stmt_info;
5903 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5904 tree scalar_type = TREE_TYPE (scalar_dest);
5905 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5907 int vec_num = vec_oprnds0.length ();
5908 gcc_assert (vec_num == 1 || slp_node);
5909 tree vec_elem_type = TREE_TYPE (vectype_out);
5910 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5912 tree vector_identity = NULL_TREE;
5913 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5914 vector_identity = build_zero_cst (vectype_out);
5916 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5917 int i;
5918 tree def0;
5919 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5921 gimple *new_stmt;
5922 tree mask = NULL_TREE;
5923 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5924 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5926 /* Handle MINUS by adding the negative. */
5927 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5929 tree negated = make_ssa_name (vectype_out);
5930 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5931 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5932 def0 = negated;
5935 if (mask)
5936 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5937 vector_identity);
5939 /* On the first iteration the input is simply the scalar phi
5940 result, and for subsequent iterations it is the output of
5941 the preceding operation. */
5942 if (reduc_fn != IFN_LAST)
5944 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5945 /* For chained SLP reductions the output of the previous reduction
5946 operation serves as the input of the next. For the final statement
5947 the output cannot be a temporary - we reuse the original
5948 scalar destination of the last statement. */
5949 if (i != vec_num - 1)
5951 gimple_set_lhs (new_stmt, scalar_dest_var);
5952 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5953 gimple_set_lhs (new_stmt, reduc_var);
5956 else
5958 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5959 reduc_var, def0);
5960 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5961 /* Remove the statement, so that we can use the same code paths
5962 as for statements that we've just created. */
5963 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5964 gsi_remove (&tmp_gsi, true);
5967 if (i == vec_num - 1)
5969 gimple_set_lhs (new_stmt, scalar_dest);
5970 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5971 new_stmt);
5973 else
5974 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5975 new_stmt, gsi);
5977 if (slp_node)
5978 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5981 if (!slp_node)
5982 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5984 return true;
5987 /* Function is_nonwrapping_integer_induction.
5989 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5990 does not cause overflow. */
5992 static bool
5993 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5995 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5996 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5997 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5998 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5999 widest_int ni, max_loop_value, lhs_max;
6000 wi::overflow_type overflow = wi::OVF_NONE;
6002 /* Make sure the loop is integer based. */
6003 if (TREE_CODE (base) != INTEGER_CST
6004 || TREE_CODE (step) != INTEGER_CST)
6005 return false;
6007 /* Check that the max size of the loop will not wrap. */
6009 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6010 return true;
6012 if (! max_stmt_executions (loop, &ni))
6013 return false;
6015 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6016 &overflow);
6017 if (overflow)
6018 return false;
6020 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6021 TYPE_SIGN (lhs_type), &overflow);
6022 if (overflow)
6023 return false;
6025 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6026 <= TYPE_PRECISION (lhs_type));
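/* Illustrative sketch (assumed values): for a 16-bit unsigned IV with
   base 0 and step 4 in a loop that executes at most 10000 times,
   max_loop_value is 0 + 4 * 10000 = 40000, which needs 16 bits, so the
   induction is accepted; with step 8 it would need 17 bits and the
   function would return false.  */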
6029 /* Check if masking can be supported by inserting a conditional expression.
6030 CODE is the code for the operation. COND_FN is the conditional internal
6031 function, if it exists. VECTYPE_IN is the type of the vector input. */
6032 static bool
6033 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6034 tree vectype_in)
6036 if (cond_fn != IFN_LAST
6037 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6038 OPTIMIZE_FOR_SPEED))
6039 return false;
6041 switch (code)
6043 case DOT_PROD_EXPR:
6044 case SAD_EXPR:
6045 return true;
6047 default:
6048 return false;
6052 /* Insert a conditional expression to enable masked vectorization. CODE is the
6053 code for the operation. VOP is the array of operands. MASK is the loop
6054 mask. GSI is a statement iterator used to place the new conditional
6055 expression. */
6056 static void
6057 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6058 gimple_stmt_iterator *gsi)
6060 switch (code)
6062 case DOT_PROD_EXPR:
6064 tree vectype = TREE_TYPE (vop[1]);
6065 tree zero = build_zero_cst (vectype);
6066 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6067 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6068 mask, vop[1], zero);
6069 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6070 vop[1] = masked_op1;
6071 break;
6074 case SAD_EXPR:
6076 tree vectype = TREE_TYPE (vop[1]);
6077 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6078 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6079 mask, vop[1], vop[0]);
6080 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6081 vop[1] = masked_op1;
6082 break;
6085 default:
6086 gcc_unreachable ();
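/* Illustrative sketch (not from the source) of the transform above for a
   masked DOT_PROD_EXPR:
     acc = DOT_PROD_EXPR <op0, op1, acc>
   becomes
     masked_op1 = mask ? op1 : { 0, ... };
     acc = DOT_PROD_EXPR <op0, masked_op1, acc>
   so inactive lanes contribute zero; for SAD_EXPR the inactive lanes of
   op1 are replaced by op0, making |op0 - op1| zero there.  */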
6090 /* Function vectorizable_reduction.
6092 Check if STMT_INFO performs a reduction operation that can be vectorized.
6093 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6094 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6095 Return true if STMT_INFO is vectorizable in this way.
6097 This function also handles reduction idioms (patterns) that have been
6098 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6099 may be of this form:
6100 X = pattern_expr (arg0, arg1, ..., X)
6101 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6102 sequence that had been detected and replaced by the pattern-stmt
6103 (STMT_INFO).
6105 This function also handles reduction of condition expressions, for example:
6106 for (int i = 0; i < N; i++)
6107 if (a[i] < value)
6108 last = a[i];
6109 This is handled by vectorising the loop and creating an additional vector
6110 containing the loop indexes for which "a[i] < value" was true. In the
6111 function epilogue this is reduced to a single max value and then used to
6112 index into the vector of results.
6114 In some cases of reduction patterns, the type of the reduction variable X is
6115 different than the type of the other arguments of STMT_INFO.
6116 In such cases, the vectype that is used when transforming STMT_INFO into
6117 a vector stmt is different than the vectype that is used to determine the
6118 vectorization factor, because it consists of a different number of elements
6119 than the actual number of elements that are being operated upon in parallel.
6121 For example, consider an accumulation of shorts into an int accumulator.
6122 On some targets it's possible to vectorize this pattern operating on 8
6123 shorts at a time (hence, the vectype for purposes of determining the
6124 vectorization factor should be V8HI); on the other hand, the vectype that
6125 is used to create the vector form is actually V4SI (the type of the result).
6127 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6128 indicates what is the actual level of parallelism (V8HI in the example), so
6129 that the right vectorization factor would be derived. This vectype
6130 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6131 be used to create the vectorized stmt. The right vectype for the vectorized
6132 stmt is obtained from the type of the result X:
6133 get_vectype_for_scalar_type (TREE_TYPE (X))
6135 This means that, contrary to "regular" reductions (or "regular" stmts in
6136 general), the following equation:
6137 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6138 does *NOT* necessarily hold for reduction patterns. */
6140 bool
6141 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6142 stmt_vec_info *vec_stmt, slp_tree slp_node,
6143 slp_instance slp_node_instance,
6144 stmt_vector_for_cost *cost_vec)
6146 tree vec_dest;
6147 tree scalar_dest;
6148 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6149 tree vectype_in = NULL_TREE;
6150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6151 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6152 enum tree_code code, orig_code;
6153 internal_fn reduc_fn;
6154 machine_mode vec_mode;
6155 int op_type;
6156 optab optab;
6157 tree new_temp = NULL_TREE;
6158 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6159 stmt_vec_info cond_stmt_vinfo = NULL;
6160 enum tree_code cond_reduc_op_code = ERROR_MARK;
6161 tree scalar_type;
6162 bool is_simple_use;
6163 int i;
6164 int ncopies;
6165 int epilog_copies;
6166 stmt_vec_info prev_stmt_info, prev_phi_info;
6167 bool single_defuse_cycle = false;
6168 stmt_vec_info new_stmt_info = NULL;
6169 int j;
6170 tree ops[3];
6171 enum vect_def_type dts[3];
6172 bool nested_cycle = false, found_nested_cycle_def = false;
6173 bool double_reduc = false;
6174 basic_block def_bb;
6175 struct loop * def_stmt_loop;
6176 tree def_arg;
6177 auto_vec<tree> vec_oprnds0;
6178 auto_vec<tree> vec_oprnds1;
6179 auto_vec<tree> vec_oprnds2;
6180 auto_vec<tree> vect_defs;
6181 auto_vec<stmt_vec_info> phis;
6182 int vec_num;
6183 tree def0, tem;
6184 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6185 tree cond_reduc_val = NULL_TREE;
6187 /* Make sure it was already recognized as a reduction computation. */
6188 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6189 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6190 return false;
6192 if (nested_in_vect_loop_p (loop, stmt_info))
6194 loop = loop->inner;
6195 nested_cycle = true;
6198 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6199 gcc_assert (slp_node
6200 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6202 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6204 tree phi_result = gimple_phi_result (phi);
6205 /* Analysis is fully done on the reduction stmt invocation. */
6206 if (! vec_stmt)
6208 if (slp_node)
6209 slp_node_instance->reduc_phis = slp_node;
6211 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6212 return true;
6215 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6216 /* Leave the scalar phi in place. Note that checking
6217 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6218 for reductions involving a single statement. */
6219 return true;
6221 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6222 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6224 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6225 == EXTRACT_LAST_REDUCTION)
6226 /* Leave the scalar phi in place. */
6227 return true;
6229 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6230 code = gimple_assign_rhs_code (reduc_stmt);
6231 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6233 tree op = gimple_op (reduc_stmt, k);
6234 if (op == phi_result)
6235 continue;
6236 if (k == 1 && code == COND_EXPR)
6237 continue;
6238 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6239 gcc_assert (is_simple_use);
6240 if (dt == vect_constant_def || dt == vect_external_def)
6241 continue;
6242 if (!vectype_in
6243 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6244 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6245 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6246 break;
6248 /* For a nested cycle we might end up with an operation like
6249 phi_result * phi_result. */
6250 if (!vectype_in)
6251 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6252 gcc_assert (vectype_in);
6254 if (slp_node)
6255 ncopies = 1;
6256 else
6257 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6259 stmt_vec_info use_stmt_info;
6260 if (ncopies > 1
6261 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6262 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6263 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6264 single_defuse_cycle = true;
6266 /* Create the destination vector */
6267 scalar_dest = gimple_assign_lhs (reduc_stmt);
6268 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6270 if (slp_node)
6271 /* The size vect_schedule_slp_instance computes is off for us. */
6272 vec_num = vect_get_num_vectors
6273 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6274 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6275 vectype_in);
6276 else
6277 vec_num = 1;
6279 /* Generate the reduction PHIs upfront. */
6280 prev_phi_info = NULL;
6281 for (j = 0; j < ncopies; j++)
6283 if (j == 0 || !single_defuse_cycle)
6285 for (i = 0; i < vec_num; i++)
6287 /* Create the reduction-phi that defines the reduction
6288 operand. */
6289 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6290 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6292 if (slp_node)
6293 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6294 else
6296 if (j == 0)
6297 STMT_VINFO_VEC_STMT (stmt_info)
6298 = *vec_stmt = new_phi_info;
6299 else
6300 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6301 prev_phi_info = new_phi_info;
6307 return true;
6310 /* 1. Is vectorizable reduction? */
6311 /* Not supportable if the reduction variable is used in the loop, unless
6312 it's a reduction chain. */
6313 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6314 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6315 return false;
6317 /* Reductions that are not used even in an enclosing outer-loop,
6318 are expected to be "live" (used out of the loop). */
6319 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6320 && !STMT_VINFO_LIVE_P (stmt_info))
6321 return false;
6323 /* 2. Has this been recognized as a reduction pattern?
6325 Check if STMT represents a pattern that has been recognized
6326 in earlier analysis stages. For stmts that represent a pattern,
6327 the STMT_VINFO_RELATED_STMT field records the last stmt in
6328 the original sequence that constitutes the pattern. */
6330 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6331 if (orig_stmt_info)
6333 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6334 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6337 /* 3. Check the operands of the operation. The first operands are defined
6338 inside the loop body. The last operand is the reduction variable,
6339 which is defined by the loop-header-phi. */
6341 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6343 /* Flatten RHS. */
6344 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6346 case GIMPLE_BINARY_RHS:
6347 code = gimple_assign_rhs_code (stmt);
6348 op_type = TREE_CODE_LENGTH (code);
6349 gcc_assert (op_type == binary_op);
6350 ops[0] = gimple_assign_rhs1 (stmt);
6351 ops[1] = gimple_assign_rhs2 (stmt);
6352 break;
6354 case GIMPLE_TERNARY_RHS:
6355 code = gimple_assign_rhs_code (stmt);
6356 op_type = TREE_CODE_LENGTH (code);
6357 gcc_assert (op_type == ternary_op);
6358 ops[0] = gimple_assign_rhs1 (stmt);
6359 ops[1] = gimple_assign_rhs2 (stmt);
6360 ops[2] = gimple_assign_rhs3 (stmt);
6361 break;
6363 case GIMPLE_UNARY_RHS:
6364 return false;
6366 default:
6367 gcc_unreachable ();
6370 if (code == COND_EXPR && slp_node)
6371 return false;
6373 scalar_dest = gimple_assign_lhs (stmt);
6374 scalar_type = TREE_TYPE (scalar_dest);
6375 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6376 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6377 return false;
6379 /* Do not try to vectorize bit-precision reductions. */
6380 if (!type_has_mode_precision_p (scalar_type))
6381 return false;
6383 /* All uses but the last are expected to be defined in the loop.
6384 The last use is the reduction variable. In case of a nested cycle this
6385 assumption is not true: we use reduc_index to record the index of the
6386 reduction variable. */
6387 stmt_vec_info reduc_def_info;
6388 if (orig_stmt_info)
6389 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6390 else
6391 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6392 gcc_assert (reduc_def_info);
6393 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6394 tree reduc_def = PHI_RESULT (reduc_def_phi);
6395 int reduc_index = -1;
6396 for (i = 0; i < op_type; i++)
6398 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6399 if (i == 0 && code == COND_EXPR)
6400 continue;
6402 stmt_vec_info def_stmt_info;
6403 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6404 &def_stmt_info);
6405 dt = dts[i];
6406 gcc_assert (is_simple_use);
6407 if (dt == vect_reduction_def
6408 && ops[i] == reduc_def)
6410 reduc_index = i;
6411 continue;
6413 else if (tem)
6415 /* To properly compute ncopies we are interested in the widest
6416 input type in case we're looking at a widening accumulation. */
6417 if (!vectype_in
6418 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6419 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6420 vectype_in = tem;
6423 if (dt != vect_internal_def
6424 && dt != vect_external_def
6425 && dt != vect_constant_def
6426 && dt != vect_induction_def
6427 && !(dt == vect_nested_cycle && nested_cycle))
6428 return false;
6430 if (dt == vect_nested_cycle
6431 && ops[i] == reduc_def)
6433 found_nested_cycle_def = true;
6434 reduc_index = i;
6437 if (i == 1 && code == COND_EXPR)
6439 /* Record how value of COND_EXPR is defined. */
6440 if (dt == vect_constant_def)
6442 cond_reduc_dt = dt;
6443 cond_reduc_val = ops[i];
6445 if (dt == vect_induction_def
6446 && def_stmt_info
6447 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6449 cond_reduc_dt = dt;
6450 cond_stmt_vinfo = def_stmt_info;
6455 if (!vectype_in)
6456 vectype_in = vectype_out;
6458 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6459 directly used in stmt. */
6460 if (reduc_index == -1)
6462 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6464 if (dump_enabled_p ())
6465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6466 "in-order reduction chain without SLP.\n");
6467 return false;
6471 if (!(reduc_index == -1
6472 || dts[reduc_index] == vect_reduction_def
6473 || dts[reduc_index] == vect_nested_cycle
6474 || ((dts[reduc_index] == vect_internal_def
6475 || dts[reduc_index] == vect_external_def
6476 || dts[reduc_index] == vect_constant_def
6477 || dts[reduc_index] == vect_induction_def)
6478 && nested_cycle && found_nested_cycle_def)))
6480 /* For pattern recognized stmts, orig_stmt might be a reduction,
6481 but some helper statements for the pattern might not, or
6482 might be COND_EXPRs with reduction uses in the condition. */
6483 gcc_assert (orig_stmt_info);
6484 return false;
6487 /* PHIs should not participate in patterns. */
6488 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6489 enum vect_reduction_type v_reduc_type
6490 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6491 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6493 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6494 /* If we have a condition reduction, see if we can simplify it further. */
6495 if (v_reduc_type == COND_REDUCTION)
6497 /* TODO: We can't yet handle reduction chains, since we need to treat
6498 each COND_EXPR in the chain specially, not just the last one.
6499 E.g. for:
6501 x_1 = PHI <x_3, ...>
6502 x_2 = a_2 ? ... : x_1;
6503 x_3 = a_3 ? ... : x_2;
6505 we're interested in the last element in x_3 for which a_2 || a_3
6506 is true, whereas the current reduction chain handling would
6507 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6508 as a reduction operation. */
6509 if (reduc_index == -1)
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 "conditional reduction chains not supported\n");
6514 return false;
6517 /* vect_is_simple_reduction ensured that operand 2 is the
6518 loop-carried operand. */
6519 gcc_assert (reduc_index == 2);
6521 /* Loop peeling modifies initial value of reduction PHI, which
6522 makes the reduction stmt to be transformed different to the
6523 original stmt analyzed. We need to record reduction code for
6524 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6525 it can be used directly at transform stage. */
6526 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6527 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6529 /* Also set the reduction type to CONST_COND_REDUCTION. */
6530 gcc_assert (cond_reduc_dt == vect_constant_def);
6531 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6533 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6534 vectype_in, OPTIMIZE_FOR_SPEED))
6536 if (dump_enabled_p ())
6537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6538 "optimizing condition reduction with"
6539 " FOLD_EXTRACT_LAST.\n");
6540 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6542 else if (cond_reduc_dt == vect_induction_def)
6544 tree base
6545 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6546 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6548 gcc_assert (TREE_CODE (base) == INTEGER_CST
6549 && TREE_CODE (step) == INTEGER_CST);
6550 cond_reduc_val = NULL_TREE;
6551 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6552 above base; punt if base is the minimum value of the type for
6553 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6554 if (tree_int_cst_sgn (step) == -1)
6556 cond_reduc_op_code = MIN_EXPR;
6557 if (tree_int_cst_sgn (base) == -1)
6558 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6559 else if (tree_int_cst_lt (base,
6560 TYPE_MAX_VALUE (TREE_TYPE (base))))
6561 cond_reduc_val
6562 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6564 else
6566 cond_reduc_op_code = MAX_EXPR;
6567 if (tree_int_cst_sgn (base) == 1)
6568 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6569 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6570 base))
6571 cond_reduc_val
6572 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6574 if (cond_reduc_val)
6576 if (dump_enabled_p ())
6577 dump_printf_loc (MSG_NOTE, vect_location,
6578 "condition expression based on "
6579 "integer induction.\n");
6580 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6581 = INTEGER_INDUC_COND_REDUCTION;
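/* Illustrative sketch (assumed source loop): for
       for (i = 0; i < n; i++)
	 if (a[i] < limit)
	   last = i;
   the stored value is the IV itself (base 0, step 1), so the epilogue can
   reduce with MAX_EXPR and use COND_REDUC_VAL (here base - 1 == -1, a
   value below the IV range) to tell the "no iteration matched" case
   apart.  */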
6584 else if (cond_reduc_dt == vect_constant_def)
6586 enum vect_def_type cond_initial_dt;
6587 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6588 tree cond_initial_val
6589 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6591 gcc_assert (cond_reduc_val != NULL_TREE);
6592 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6593 if (cond_initial_dt == vect_constant_def
6594 && types_compatible_p (TREE_TYPE (cond_initial_val),
6595 TREE_TYPE (cond_reduc_val)))
6597 tree e = fold_binary (LE_EXPR, boolean_type_node,
6598 cond_initial_val, cond_reduc_val);
6599 if (e && (integer_onep (e) || integer_zerop (e)))
6601 if (dump_enabled_p ())
6602 dump_printf_loc (MSG_NOTE, vect_location,
6603 "condition expression based on "
6604 "compile time constant.\n");
6605 /* Record reduction code at analysis stage. */
6606 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6607 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6608 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6609 = CONST_COND_REDUCTION;
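/* Illustrative sketch (assumed source loop): for
       int found = 0;
       for (i = 0; i < n; i++)
	 if (a[i] == key)
	   found = 1;
   the initial value (0) and the reduced value (1) are both compile-time
   constants with 0 <= 1, so the epilogue can simply reduce with
   MAX_EXPR.  */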
6615 if (orig_stmt_info)
6616 gcc_assert (tmp == orig_stmt_info
6617 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6618 else
6619 /* We changed STMT to be the first stmt in the reduction chain, hence we
6620 check that in this case the first element in the chain is STMT. */
6621 gcc_assert (tmp == stmt_info
6622 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6624 if (STMT_VINFO_LIVE_P (reduc_def_info))
6625 return false;
6627 if (slp_node)
6628 ncopies = 1;
6629 else
6630 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6632 gcc_assert (ncopies >= 1);
6634 vec_mode = TYPE_MODE (vectype_in);
6635 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6637 if (nested_cycle)
6639 def_bb = gimple_bb (reduc_def_phi);
6640 def_stmt_loop = def_bb->loop_father;
6641 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6642 loop_preheader_edge (def_stmt_loop));
6643 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6644 if (def_arg_stmt_info
6645 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6646 == vect_double_reduction_def))
6647 double_reduc = true;
6650 vect_reduction_type reduction_type
6651 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6652 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6653 && ncopies > 1)
6655 if (dump_enabled_p ())
6656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6657 "multiple types in double reduction or condition "
6658 "reduction.\n");
6659 return false;
6662 if (code == COND_EXPR)
6664 /* Only call during the analysis stage, otherwise we'll lose
6665 STMT_VINFO_TYPE. */
6666 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6667 true, NULL, cost_vec))
6669 if (dump_enabled_p ())
6670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6671 "unsupported condition in reduction\n");
6672 return false;
6675 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6676 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6678 /* Only call during the analysis stage, otherwise we'll lose
6679 STMT_VINFO_TYPE. We only support this for nested cycles
6680 without double reductions at the moment. */
6681 if (!nested_cycle
6682 || double_reduc
6683 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6684 NULL, cost_vec)))
6686 if (dump_enabled_p ())
6687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6688 "unsupported shift or rotation in reduction\n");
6689 return false;
6692 else
6694 /* 4. Supportable by target? */
6696 /* 4.1. check support for the operation in the loop */
6697 optab = optab_for_tree_code (code, vectype_in, optab_default);
6698 if (!optab)
6700 if (dump_enabled_p ())
6701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6702 "no optab.\n");
6704 return false;
6707 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6709 if (dump_enabled_p ())
6710 dump_printf (MSG_NOTE, "op not supported by target.\n");
6712 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6713 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6714 return false;
6716 if (dump_enabled_p ())
6717 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6720 /* Worthwhile without SIMD support? */
6721 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6722 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6724 if (dump_enabled_p ())
6725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6726 "not worthwhile without SIMD support.\n");
6728 return false;
6732 /* 4.2. Check support for the epilog operation.
6734 If STMT represents a reduction pattern, then the type of the
6735 reduction variable may be different than the type of the rest
6736 of the arguments. For example, consider the case of accumulation
6737 of shorts into an int accumulator; the original code:
6738 S1: int_a = (int) short_a;
6739 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6741 was replaced with:
6742 STMT: int_acc = widen_sum <short_a, int_acc>
6744 This means that:
6745 1. The tree-code that is used to create the vector operation in the
6746 epilog code (that reduces the partial results) is not the
6747 tree-code of STMT, but is rather the tree-code of the original
6748 stmt from the pattern that STMT is replacing. I.e, in the example
6749 above we want to use 'widen_sum' in the loop, but 'plus' in the
6750 epilog.
6751 2. The type (mode) we use to check available target support
6752 for the vector operation to be created in the *epilog*, is
6753 determined by the type of the reduction variable (in the example
6754 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6755 However the type (mode) we use to check available target support
6756 for the vector operation to be created *inside the loop*, is
6757 determined by the type of the other arguments to STMT (in the
6758 example we'd check this: optab_handler (widen_sum_optab,
6759 vect_short_mode)).
6761 This is contrary to "regular" reductions, in which the types of all
6762 the arguments are the same as the type of the reduction variable.
6763 For "regular" reductions we can therefore use the same vector type
6764 (and also the same tree-code) when generating the epilog code and
6765 when generating the code inside the loop. */
6767 if (orig_stmt_info
6768 && (reduction_type == TREE_CODE_REDUCTION
6769 || reduction_type == FOLD_LEFT_REDUCTION))
6771 /* This is a reduction pattern: get the vectype from the type of the
6772 reduction variable, and get the tree-code from orig_stmt. */
6773 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6774 gcc_assert (vectype_out);
6775 vec_mode = TYPE_MODE (vectype_out);
6777 else
6779 /* Regular reduction: the same vectype and tree-code as used for
6780 the vector code inside the loop can be used for the epilog code. */
6781 orig_code = code;
6783 if (code == MINUS_EXPR)
6784 orig_code = PLUS_EXPR;
6786 /* For simple condition reductions, replace with the actual expression
6787 we want to base our reduction around. */
6788 if (reduction_type == CONST_COND_REDUCTION)
6790 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6791 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6793 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6794 orig_code = cond_reduc_op_code;
6797 reduc_fn = IFN_LAST;
6799 if (reduction_type == TREE_CODE_REDUCTION
6800 || reduction_type == FOLD_LEFT_REDUCTION
6801 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6802 || reduction_type == CONST_COND_REDUCTION)
6804 if (reduction_type == FOLD_LEFT_REDUCTION
6805 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6806 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6808 if (reduc_fn != IFN_LAST
6809 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6810 OPTIMIZE_FOR_SPEED))
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6814 "reduc op not supported by target.\n");
6816 reduc_fn = IFN_LAST;
6819 else
6821 if (!nested_cycle || double_reduc)
6823 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6825 "no reduc code for scalar code.\n");
6827 return false;
6831 else if (reduction_type == COND_REDUCTION)
6833 int scalar_precision
6834 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6835 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6836 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6837 nunits_out);
6839 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6840 OPTIMIZE_FOR_SPEED))
6841 reduc_fn = IFN_REDUC_MAX;
6844 if (reduction_type != EXTRACT_LAST_REDUCTION
6845 && (!nested_cycle || double_reduc)
6846 && reduc_fn == IFN_LAST
6847 && !nunits_out.is_constant ())
6849 if (dump_enabled_p ())
6850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6851 "missing target support for reduction on"
6852 " variable-length vectors.\n");
6853 return false;
6856 /* For SLP reductions, see if there is a neutral value we can use. */
6857 tree neutral_op = NULL_TREE;
6858 if (slp_node)
6859 neutral_op = neutral_op_for_slp_reduction
6860 (slp_node_instance->reduc_phis, code,
6861 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6863 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6865 /* We can't support in-order reductions of code such as this:
6867 for (int i = 0; i < n1; ++i)
6868 for (int j = 0; j < n2; ++j)
6869 l += a[j];
6871 since GCC effectively transforms the loop when vectorizing:
6873 for (int i = 0; i < n1 / VF; ++i)
6874 for (int j = 0; j < n2; ++j)
6875 for (int k = 0; k < VF; ++k)
6876 l += a[j];
6878 which is a reassociation of the original operation. */
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "in-order double reduction not supported.\n");
6883 return false;
6886 if (reduction_type == FOLD_LEFT_REDUCTION
6887 && slp_node
6888 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6890 /* We cannot use in-order reductions in this case because there is
6891 an implicit reassociation of the operations involved. */
6892 if (dump_enabled_p ())
6893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6894 "in-order unchained SLP reductions not supported.\n");
6895 return false;
6898 /* For double reductions, and for SLP reductions with a neutral value,
6899 we construct a variable-length initial vector by loading a vector
6900 full of the neutral value and then shift-and-inserting the start
6901 values into the low-numbered elements. */
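/* As a sketch: for a summation with start value S and neutral value 0,
   the initial vector is conceptually built as

     init = { 0, 0, ..., 0 };            -- vector full of the neutral value
     init = VEC_SHL_INSERT (init, S);    -- => { S, 0, ..., 0 }

   which works for any (possibly runtime) number of lanes.  */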
6902 if ((double_reduc || neutral_op)
6903 && !nunits_out.is_constant ()
6904 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6905 vectype_out, OPTIMIZE_FOR_SPEED))
6907 if (dump_enabled_p ())
6908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6909 "reduction on variable-length vectors requires"
6910 " target support for a vector-shift-and-insert"
6911 " operation.\n");
6912 return false;
6915 /* Check extra constraints for variable-length unchained SLP reductions. */
6916 if (STMT_SLP_TYPE (stmt_info)
6917 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6918 && !nunits_out.is_constant ())
6920 /* We checked above that we could build the initial vector when
6921 there's a neutral element value. Check here for the case in
6922 which each SLP statement has its own initial value and in which
6923 that value needs to be repeated for every instance of the
6924 statement within the initial vector. */
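/* As a sketch: for a group of two reduction statements with initial
   values A and B and no neutral value, the initial vector has to look
   like

     { A, B, A, B, ... }

   repeated across the whole (possibly runtime) vector length, which is
   what can_duplicate_and_interleave_p checks we can build.  */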
6925 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6926 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6927 if (!neutral_op
6928 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6932 "unsupported form of SLP reduction for"
6933 " variable-length vectors: cannot build"
6934 " initial vector.\n");
6935 return false;
6937 /* The epilogue code relies on the number of elements being a multiple
6938 of the group size. The duplicate-and-interleave approach to setting
6939 up the initial vector does too. */
6940 if (!multiple_p (nunits_out, group_size))
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6944 "unsupported form of SLP reduction for"
6945 " variable-length vectors: the vector size"
6946 " is not a multiple of the number of results.\n");
6947 return false;
6951 /* In case of widening multiplication by a constant, we update the type
6952 of the constant to be the type of the other operand. We check that the
6953 constant fits the type in the pattern recognition pass. */
6954 if (code == DOT_PROD_EXPR
6955 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6957 if (TREE_CODE (ops[0]) == INTEGER_CST)
6958 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6959 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6960 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6961 else
6963 if (dump_enabled_p ())
6964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6965 "invalid types in dot-prod\n");
6967 return false;
6971 if (reduction_type == COND_REDUCTION)
6973 widest_int ni;
6975 if (! max_loop_iterations (loop, &ni))
6977 if (dump_enabled_p ())
6978 dump_printf_loc (MSG_NOTE, vect_location,
6979 "loop count not known, cannot create cond "
6980 "reduction.\n");
6981 return false;
6983 /* Convert backedges to iterations. */
6984 ni += 1;
6986 /* The additional index will be the same type as the condition. Check
6987 that the loop iteration count fits into this type less one (because
6988 the zero slot is used up for when there are no matches). */
6989 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6990 if (wi::geu_p (ni, wi::to_widest (max_index)))
6992 if (dump_enabled_p ())
6993 dump_printf_loc (MSG_NOTE, vect_location,
6994 "loop size is greater than data size.\n");
6995 return false;
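/* Worked example: for a 16-bit scalar type the index type is a 16-bit
   unsigned type, so max_index is 65535. Index 0 is reserved for "no
   match", which leaves room for at most 65534 iterations; a loop that
   may iterate 65535 times or more is rejected above.  */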
6999 /* In case the vectorization factor (VF) is bigger than the number
7000 of elements that we can fit in a vectype (nunits), we have to generate
7001 more than one vector stmt, i.e. we need to "unroll" the
7002 vector stmt by a factor VF/nunits. For more details see the documentation
7003 in vectorizable_operation. */
7005 /* If the reduction is used in an outer loop we need to generate
7006 VF intermediate results, like so (e.g. for ncopies=2):
7007 r0 = phi (init, r0)
7008 r1 = phi (init, r1)
7009 r0 = x0 + r0;
7010 r1 = x1 + r1;
7011 (i.e. we generate VF results in 2 registers).
7012 In this case we have a separate def-use cycle for each copy, and therefore
7013 for each copy we get the vector def for the reduction variable from the
7014 respective phi node created for this copy.
7016 Otherwise (the reduction is unused in the loop nest), we can combine
7017 together intermediate results, like so (e.g. for ncopies=2):
7018 r = phi (init, r)
7019 r = x0 + r;
7020 r = x1 + r;
7021 (i.e. we generate VF/2 results in a single register).
7022 In this case for each copy we get the vector def for the reduction variable
7023 from the vectorized reduction operation generated in the previous iteration.
7025 This only works when we see both the reduction PHI and its only consumer
7026 in vectorizable_reduction and there are no intermediate stmts
7027 participating. */
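/* As a vector-level sketch for ncopies == 2 with 4-lane vectors:

     separate def-use cycles:            single def-use cycle:
       vr0 = { 0, 0, 0, 0 };               vr = { 0, 0, 0, 0 };
       vr1 = { 0, 0, 0, 0 };               loop:
       loop:                                 vr += x[i .. i+3];
         vr0 += x[i .. i+3];                 vr += x[i+4 .. i+7];
         vr1 += x[i+4 .. i+7];

   the left form needs epilog_copies == ncopies reductions in the
   epilogue, the right form only one.  */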
7028 stmt_vec_info use_stmt_info;
7029 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7030 if (ncopies > 1
7031 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7032 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7033 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7035 single_defuse_cycle = true;
7036 epilog_copies = 1;
7038 else
7039 epilog_copies = ncopies;
7041 /* If the reduction stmt is one of the patterns that have a lane-reducing
7042 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
7043 if ((ncopies > 1
7044 && ! single_defuse_cycle)
7045 && (code == DOT_PROD_EXPR
7046 || code == WIDEN_SUM_EXPR
7047 || code == SAD_EXPR))
7049 if (dump_enabled_p ())
7050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7051 "multi def-use cycle not possible for lane-reducing "
7052 "reduction operation\n");
7053 return false;
7056 if (slp_node)
7057 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7058 else
7059 vec_num = 1;
7061 internal_fn cond_fn = get_conditional_internal_fn (code);
7062 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7063 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7065 if (!vec_stmt) /* transformation not required. */
7067 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7068 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7070 if (reduction_type != FOLD_LEFT_REDUCTION
7071 && !mask_by_cond_expr
7072 && (cond_fn == IFN_LAST
7073 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7074 OPTIMIZE_FOR_SPEED)))
7076 if (dump_enabled_p ())
7077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7078 "can't use a fully-masked loop because no"
7079 " conditional operation is available.\n");
7080 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7082 else if (reduc_index == -1)
7084 if (dump_enabled_p ())
7085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7086 "can't use a fully-masked loop for chained"
7087 " reductions.\n");
7088 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7090 else
7091 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7092 vectype_in);
7094 if (dump_enabled_p ()
7095 && reduction_type == FOLD_LEFT_REDUCTION)
7096 dump_printf_loc (MSG_NOTE, vect_location,
7097 "using an in-order (fold-left) reduction.\n");
7098 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7099 return true;
7102 /* Transform. */
7104 if (dump_enabled_p ())
7105 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7107 /* FORNOW: Multiple types are not supported for condition. */
7108 if (code == COND_EXPR)
7109 gcc_assert (ncopies == 1);
7111 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7113 if (reduction_type == FOLD_LEFT_REDUCTION)
7114 return vectorize_fold_left_reduction
7115 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7116 reduc_fn, ops, vectype_in, reduc_index, masks);
7118 if (reduction_type == EXTRACT_LAST_REDUCTION)
7120 gcc_assert (!slp_node);
7121 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7122 true, NULL, NULL);
7125 /* Create the destination vector */
7126 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7128 prev_stmt_info = NULL;
7129 prev_phi_info = NULL;
7130 if (!slp_node)
7132 vec_oprnds0.create (1);
7133 vec_oprnds1.create (1);
7134 if (op_type == ternary_op)
7135 vec_oprnds2.create (1);
7138 phis.create (vec_num);
7139 vect_defs.create (vec_num);
7140 if (!slp_node)
7141 vect_defs.quick_push (NULL_TREE);
7143 if (slp_node)
7144 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7145 else
7146 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7148 for (j = 0; j < ncopies; j++)
7150 if (code == COND_EXPR)
7152 gcc_assert (!slp_node);
7153 vectorizable_condition (stmt_info, gsi, vec_stmt,
7154 true, NULL, NULL);
7155 break;
7157 if (code == LSHIFT_EXPR
7158 || code == RSHIFT_EXPR)
7160 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7161 break;
7164 /* Handle uses. */
7165 if (j == 0)
7167 if (slp_node)
7169 /* Get vec defs for all the operands except the reduction index,
7170 ensuring the ordering of the ops in the vector is kept. */
7171 auto_vec<tree, 3> slp_ops;
7172 auto_vec<vec<tree>, 3> vec_defs;
7174 slp_ops.quick_push (ops[0]);
7175 slp_ops.quick_push (ops[1]);
7176 if (op_type == ternary_op)
7177 slp_ops.quick_push (ops[2]);
7179 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7181 vec_oprnds0.safe_splice (vec_defs[0]);
7182 vec_defs[0].release ();
7183 vec_oprnds1.safe_splice (vec_defs[1]);
7184 vec_defs[1].release ();
7185 if (op_type == ternary_op)
7187 vec_oprnds2.safe_splice (vec_defs[2]);
7188 vec_defs[2].release ();
7191 else
7193 vec_oprnds0.quick_push
7194 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7195 vec_oprnds1.quick_push
7196 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7197 if (op_type == ternary_op)
7198 vec_oprnds2.quick_push
7199 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7202 else
7204 if (!slp_node)
7206 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7208 if (single_defuse_cycle && reduc_index == 0)
7209 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7210 else
7211 vec_oprnds0[0]
7212 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7213 vec_oprnds0[0]);
7214 if (single_defuse_cycle && reduc_index == 1)
7215 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7216 else
7217 vec_oprnds1[0]
7218 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7219 vec_oprnds1[0]);
7220 if (op_type == ternary_op)
7222 if (single_defuse_cycle && reduc_index == 2)
7223 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7224 else
7225 vec_oprnds2[0]
7226 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7227 vec_oprnds2[0]);
7232 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7234 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7235 if (masked_loop_p && !mask_by_cond_expr)
7237 /* Make sure that the reduction accumulator is vop[0]. */
7238 if (reduc_index == 1)
7240 gcc_assert (commutative_tree_code (code));
7241 std::swap (vop[0], vop[1]);
7243 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7244 vectype_in, i * ncopies + j);
7245 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7246 vop[0], vop[1],
7247 vop[0]);
7248 new_temp = make_ssa_name (vec_dest, call);
7249 gimple_call_set_lhs (call, new_temp);
7250 gimple_call_set_nothrow (call, true);
7251 new_stmt_info
7252 = vect_finish_stmt_generation (stmt_info, call, gsi);
7254 else
7256 if (op_type == ternary_op)
7257 vop[2] = vec_oprnds2[i];
7259 if (masked_loop_p && mask_by_cond_expr)
7261 tree mask = vect_get_loop_mask (gsi, masks,
7262 vec_num * ncopies,
7263 vectype_in, i * ncopies + j);
7264 build_vect_cond_expr (code, vop, mask, gsi);
7267 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7268 vop[0], vop[1], vop[2]);
7269 new_temp = make_ssa_name (vec_dest, new_stmt);
7270 gimple_assign_set_lhs (new_stmt, new_temp);
7271 new_stmt_info
7272 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7275 if (slp_node)
7277 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7278 vect_defs.quick_push (new_temp);
7280 else
7281 vect_defs[0] = new_temp;
7284 if (slp_node)
7285 continue;
7287 if (j == 0)
7288 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7289 else
7290 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7292 prev_stmt_info = new_stmt_info;
7295 /* Finalize the reduction-phi (set its arguments) and create the
7296 epilog reduction code. */
7297 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7298 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7300 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7301 epilog_copies, reduc_fn, phis,
7302 double_reduc, slp_node, slp_node_instance,
7303 cond_reduc_val, cond_reduc_op_code,
7304 neutral_op);
7306 return true;
7309 /* Function vect_min_worthwhile_factor.
7311 For a loop where we could vectorize the operation indicated by CODE,
7312 return the minimum vectorization factor that makes it worthwhile
7313 to use generic vectors. */
7314 static unsigned int
7315 vect_min_worthwhile_factor (enum tree_code code)
7317 switch (code)
7319 case PLUS_EXPR:
7320 case MINUS_EXPR:
7321 case NEGATE_EXPR:
7322 return 4;
7324 case BIT_AND_EXPR:
7325 case BIT_IOR_EXPR:
7326 case BIT_XOR_EXPR:
7327 case BIT_NOT_EXPR:
7328 return 2;
7330 default:
7331 return INT_MAX;
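/* As a sketch of what "generic vectors" means here: without target SIMD
   support a vector operation is decomposed into scalar operations, e.g.

     typedef int v4si __attribute__ ((vector_size (16)));
     v4si a, b, c;
     c = a + b;   -- lowered to c[0] = a[0] + b[0]; ... c[3] = a[3] + b[3];

   which is only worth doing when the vectorization factor reaches the
   values returned above.  */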
7335 /* Return true if VINFO indicates we are doing loop vectorization and if
7336 it is worth decomposing CODE operations into scalar operations for
7337 that loop's vectorization factor. */
7339 bool
7340 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7342 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7343 unsigned HOST_WIDE_INT value;
7344 return (loop_vinfo
7345 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7346 && value >= vect_min_worthwhile_factor (code));
7349 /* Function vectorizable_induction
7351 Check if STMT_INFO performs an induction computation that can be vectorized.
7352 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7353 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7354 Return true if STMT_INFO is vectorizable in this way. */
7356 bool
7357 vectorizable_induction (stmt_vec_info stmt_info,
7358 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7359 stmt_vec_info *vec_stmt, slp_tree slp_node,
7360 stmt_vector_for_cost *cost_vec)
7362 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7363 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7364 unsigned ncopies;
7365 bool nested_in_vect_loop = false;
7366 struct loop *iv_loop;
7367 tree vec_def;
7368 edge pe = loop_preheader_edge (loop);
7369 basic_block new_bb;
7370 tree new_vec, vec_init, vec_step, t;
7371 tree new_name;
7372 gimple *new_stmt;
7373 gphi *induction_phi;
7374 tree induc_def, vec_dest;
7375 tree init_expr, step_expr;
7376 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7377 unsigned i;
7378 tree expr;
7379 gimple_seq stmts;
7380 imm_use_iterator imm_iter;
7381 use_operand_p use_p;
7382 gimple *exit_phi;
7383 edge latch_e;
7384 tree loop_arg;
7385 gimple_stmt_iterator si;
7387 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7388 if (!phi)
7389 return false;
7391 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7392 return false;
7394 /* Make sure it was recognized as induction computation. */
7395 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7396 return false;
7398 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7399 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7401 if (slp_node)
7402 ncopies = 1;
7403 else
7404 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7405 gcc_assert (ncopies >= 1);
7407 /* FORNOW. These restrictions should be relaxed. */
7408 if (nested_in_vect_loop_p (loop, stmt_info))
7410 imm_use_iterator imm_iter;
7411 use_operand_p use_p;
7412 gimple *exit_phi;
7413 edge latch_e;
7414 tree loop_arg;
7416 if (ncopies > 1)
7418 if (dump_enabled_p ())
7419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420 "multiple types in nested loop.\n");
7421 return false;
7424 /* FORNOW: outer loop induction with SLP not supported. */
7425 if (STMT_SLP_TYPE (stmt_info))
7426 return false;
7428 exit_phi = NULL;
7429 latch_e = loop_latch_edge (loop->inner);
7430 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7431 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7433 gimple *use_stmt = USE_STMT (use_p);
7434 if (is_gimple_debug (use_stmt))
7435 continue;
7437 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7439 exit_phi = use_stmt;
7440 break;
7443 if (exit_phi)
7445 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7446 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7447 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7449 if (dump_enabled_p ())
7450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7451 "inner-loop induction only used outside "
7452 "of the outer vectorized loop.\n");
7453 return false;
7457 nested_in_vect_loop = true;
7458 iv_loop = loop->inner;
7460 else
7461 iv_loop = loop;
7462 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7464 if (slp_node && !nunits.is_constant ())
7466 /* The current SLP code creates the initial value element-by-element. */
7467 if (dump_enabled_p ())
7468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7469 "SLP induction not supported for variable-length"
7470 " vectors.\n");
7471 return false;
7474 if (!vec_stmt) /* transformation not required. */
7476 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7477 DUMP_VECT_SCOPE ("vectorizable_induction");
7478 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7479 return true;
7482 /* Transform. */
7484 /* Compute a vector variable, initialized with the first VF values of
7485 the induction variable. E.g., for an iv with IV_PHI='X' and
7486 evolution S, for a vector of 4 units, we want to compute:
7487 [X, X + S, X + 2*S, X + 3*S]. */
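/* Worked example: for an induction with initial value X = 7 and step
   S = 3, a 4-lane vector IV starts as

     vec_init = { 7, 10, 13, 16 };
     vec_step = { 12, 12, 12, 12 };   -- VF * S with VF == 4

   and each loop iteration adds vec_step to the vector IV.  */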
7489 if (dump_enabled_p ())
7490 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7492 latch_e = loop_latch_edge (iv_loop);
7493 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7495 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7496 gcc_assert (step_expr != NULL_TREE);
7498 pe = loop_preheader_edge (iv_loop);
7499 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7500 loop_preheader_edge (iv_loop));
7502 stmts = NULL;
7503 if (!nested_in_vect_loop)
7505 /* Convert the initial value to the desired type. */
7506 tree new_type = TREE_TYPE (vectype);
7507 init_expr = gimple_convert (&stmts, new_type, init_expr);
7509 /* If we are using the loop mask to "peel" for alignment then we need
7510 to adjust the start value here. */
7511 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7512 if (skip_niters != NULL_TREE)
7514 if (FLOAT_TYPE_P (vectype))
7515 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7516 skip_niters);
7517 else
7518 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7519 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7520 skip_niters, step_expr);
7521 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7522 init_expr, skip_step);
7526 /* Convert the step to the desired type. */
7527 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7529 if (stmts)
7531 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7532 gcc_assert (!new_bb);
7535 /* Find the first insertion point in the BB. */
7536 basic_block bb = gimple_bb (phi);
7537 si = gsi_after_labels (bb);
7539 /* For SLP induction we have to generate several IVs as for example
7540 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7541 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7542 [VF*S, VF*S, VF*S, VF*S] for all. */
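/* Worked example of the bookkeeping below, for group size 3 and 4-lane
   vectors (const_nunits == 4):

     nivs = least_common_multiple (3, 4) / 4 = 3 initial vectors,

   exactly the three vectors shown above; any further vectors are
   derived from earlier ones by adding VF'*S with

     VF' = least_common_multiple (3, 4) / 3 = 4.  */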
7543 if (slp_node)
7545 /* Enforced above. */
7546 unsigned int const_nunits = nunits.to_constant ();
7548 /* Generate [VF*S, VF*S, ... ]. */
7549 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7551 expr = build_int_cst (integer_type_node, vf);
7552 expr = fold_convert (TREE_TYPE (step_expr), expr);
7554 else
7555 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7556 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7557 expr, step_expr);
7558 if (! CONSTANT_CLASS_P (new_name))
7559 new_name = vect_init_vector (stmt_info, new_name,
7560 TREE_TYPE (step_expr), NULL);
7561 new_vec = build_vector_from_val (vectype, new_name);
7562 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7564 /* Now generate the IVs. */
7565 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7566 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7567 unsigned elts = const_nunits * nvects;
7568 unsigned nivs = least_common_multiple (group_size,
7569 const_nunits) / const_nunits;
7570 gcc_assert (elts % group_size == 0);
7571 tree elt = init_expr;
7572 unsigned ivn;
7573 for (ivn = 0; ivn < nivs; ++ivn)
7575 tree_vector_builder elts (vectype, const_nunits, 1);
7576 stmts = NULL;
7577 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7579 if (ivn*const_nunits + eltn >= group_size
7580 && (ivn * const_nunits + eltn) % group_size == 0)
7581 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7582 elt, step_expr);
7583 elts.quick_push (elt);
7585 vec_init = gimple_build_vector (&stmts, &elts);
7586 if (stmts)
7588 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7589 gcc_assert (!new_bb);
7592 /* Create the induction-phi that defines the induction-operand. */
7593 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7594 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7595 stmt_vec_info induction_phi_info
7596 = loop_vinfo->add_stmt (induction_phi);
7597 induc_def = PHI_RESULT (induction_phi);
7599 /* Create the iv update inside the loop */
7600 vec_def = make_ssa_name (vec_dest);
7601 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7602 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7603 loop_vinfo->add_stmt (new_stmt);
7605 /* Set the arguments of the phi node: */
7606 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7607 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7608 UNKNOWN_LOCATION);
7610 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7613 /* Re-use IVs when we can. */
7614 if (ivn < nvects)
7616 unsigned vfp
7617 = least_common_multiple (group_size, const_nunits) / group_size;
7618 /* Generate [VF'*S, VF'*S, ... ]. */
7619 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7621 expr = build_int_cst (integer_type_node, vfp);
7622 expr = fold_convert (TREE_TYPE (step_expr), expr);
7624 else
7625 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7626 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7627 expr, step_expr);
7628 if (! CONSTANT_CLASS_P (new_name))
7629 new_name = vect_init_vector (stmt_info, new_name,
7630 TREE_TYPE (step_expr), NULL);
7631 new_vec = build_vector_from_val (vectype, new_name);
7632 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7633 for (; ivn < nvects; ++ivn)
7635 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7636 tree def;
7637 if (gimple_code (iv) == GIMPLE_PHI)
7638 def = gimple_phi_result (iv);
7639 else
7640 def = gimple_assign_lhs (iv);
7641 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7642 PLUS_EXPR,
7643 def, vec_step);
7644 if (gimple_code (iv) == GIMPLE_PHI)
7645 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7646 else
7648 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7649 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7651 SLP_TREE_VEC_STMTS (slp_node).quick_push
7652 (loop_vinfo->add_stmt (new_stmt));
7656 return true;
7659 /* Create the vector that holds the initial_value of the induction. */
7660 if (nested_in_vect_loop)
7662 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7663 been created during vectorization of previous stmts. We obtain it
7664 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7665 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7666 /* If the initial value is not of proper type, convert it. */
7667 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7669 new_stmt
7670 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7671 vect_simple_var,
7672 "vec_iv_"),
7673 VIEW_CONVERT_EXPR,
7674 build1 (VIEW_CONVERT_EXPR, vectype,
7675 vec_init));
7676 vec_init = gimple_assign_lhs (new_stmt);
7677 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7678 new_stmt);
7679 gcc_assert (!new_bb);
7680 loop_vinfo->add_stmt (new_stmt);
7683 else
7685 /* iv_loop is the loop to be vectorized. Create:
7686 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7687 stmts = NULL;
7688 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7690 unsigned HOST_WIDE_INT const_nunits;
7691 if (nunits.is_constant (&const_nunits))
7693 tree_vector_builder elts (vectype, const_nunits, 1);
7694 elts.quick_push (new_name);
7695 for (i = 1; i < const_nunits; i++)
7697 /* Create: new_name_i = new_name + step_expr */
7698 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7699 new_name, step_expr);
7700 elts.quick_push (new_name);
7702 /* Create a vector from [new_name_0, new_name_1, ...,
7703 new_name_nunits-1] */
7704 vec_init = gimple_build_vector (&stmts, &elts);
7706 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7707 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7708 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7709 new_name, step_expr);
7710 else
7712 /* Build:
7713 [base, base, base, ...]
7714 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
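/* Worked example: base 1.0, step 0.5, 4 lanes:

     { 1.0, 1.0, 1.0, 1.0 }
       + { 0.0, 1.0, 2.0, 3.0 } * { 0.5, 0.5, 0.5, 0.5 }
       = { 1.0, 1.5, 2.0, 2.5 }.  */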
7715 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7716 gcc_assert (flag_associative_math);
7717 tree index = build_index_vector (vectype, 0, 1);
7718 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7719 new_name);
7720 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7721 step_expr);
7722 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7723 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7724 vec_init, step_vec);
7725 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7726 vec_init, base_vec);
7729 if (stmts)
7731 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7732 gcc_assert (!new_bb);
7737 /* Create the vector that holds the step of the induction. */
7738 if (nested_in_vect_loop)
7739 /* iv_loop is nested in the loop to be vectorized. Generate:
7740 vec_step = [S, S, S, S] */
7741 new_name = step_expr;
7742 else
7744 /* iv_loop is the loop to be vectorized. Generate:
7745 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7746 gimple_seq seq = NULL;
7747 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7749 expr = build_int_cst (integer_type_node, vf);
7750 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7752 else
7753 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7754 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7755 expr, step_expr);
7756 if (seq)
7758 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7759 gcc_assert (!new_bb);
7763 t = unshare_expr (new_name);
7764 gcc_assert (CONSTANT_CLASS_P (new_name)
7765 || TREE_CODE (new_name) == SSA_NAME);
7766 new_vec = build_vector_from_val (vectype, t);
7767 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7770 /* Create the following def-use cycle:
7771 loop prolog:
7772 vec_init = ...
7773 vec_step = ...
7774 loop:
7775 vec_iv = PHI <vec_init, vec_loop>
7777 STMT
7779 vec_loop = vec_iv + vec_step; */
7781 /* Create the induction-phi that defines the induction-operand. */
7782 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7783 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7784 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7785 induc_def = PHI_RESULT (induction_phi);
7787 /* Create the iv update inside the loop */
7788 vec_def = make_ssa_name (vec_dest);
7789 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7790 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7791 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7793 /* Set the arguments of the phi node: */
7794 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7795 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7796 UNKNOWN_LOCATION);
7798 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7800 /* In case the vectorization factor (VF) is bigger than the number
7801 of elements that we can fit in a vectype (nunits), we have to generate
7802 more than one vector stmt, i.e. we need to "unroll" the
7803 vector stmt by a factor VF/nunits. For more details see the documentation
7804 in vectorizable_operation. */
7806 if (ncopies > 1)
7808 gimple_seq seq = NULL;
7809 stmt_vec_info prev_stmt_vinfo;
7810 /* FORNOW. This restriction should be relaxed. */
7811 gcc_assert (!nested_in_vect_loop);
7813 /* Create the vector that holds the step of the induction. */
7814 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7816 expr = build_int_cst (integer_type_node, nunits);
7817 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7819 else
7820 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7821 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7822 expr, step_expr);
7823 if (seq)
7825 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7826 gcc_assert (!new_bb);
7829 t = unshare_expr (new_name);
7830 gcc_assert (CONSTANT_CLASS_P (new_name)
7831 || TREE_CODE (new_name) == SSA_NAME);
7832 new_vec = build_vector_from_val (vectype, t);
7833 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7835 vec_def = induc_def;
7836 prev_stmt_vinfo = induction_phi_info;
7837 for (i = 1; i < ncopies; i++)
7839 /* vec_i = vec_prev + vec_step */
7840 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7841 vec_def, vec_step);
7842 vec_def = make_ssa_name (vec_dest, new_stmt);
7843 gimple_assign_set_lhs (new_stmt, vec_def);
7845 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7846 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7847 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7848 prev_stmt_vinfo = new_stmt_info;
7852 if (nested_in_vect_loop)
7854 /* Find the loop-closed exit-phi of the induction, and record
7855 the final vector of induction results: */
7856 exit_phi = NULL;
7857 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7859 gimple *use_stmt = USE_STMT (use_p);
7860 if (is_gimple_debug (use_stmt))
7861 continue;
7863 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7865 exit_phi = use_stmt;
7866 break;
7869 if (exit_phi)
7871 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7872 /* FORNOW. We do not yet support the case in which an inner-loop induction
7873 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
7874 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7875 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7877 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7878 if (dump_enabled_p ())
7879 dump_printf_loc (MSG_NOTE, vect_location,
7880 "vector of inductions after inner-loop:%G",
7881 new_stmt);
7886 if (dump_enabled_p ())
7887 dump_printf_loc (MSG_NOTE, vect_location,
7888 "transform induction: created def-use cycle: %G%G",
7889 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7891 return true;
7894 /* Function vectorizable_live_operation.
7896 STMT_INFO computes a value that is used outside the loop. Check if
7897 it can be supported. */
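/* As a sketch of what "live" means here:

     for (i = 0; i < n; i++)
       {
         tmp = a[i] + 1;
         b[i] = tmp;
       }
     use (tmp);   -- tmp from the final iteration is live after the loop

   the scalar result of the last iteration has to be extracted from the
   last lane of the last vector statement (or via EXTRACT_LAST when the
   loop is fully masked).  */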
7899 bool
7900 vectorizable_live_operation (stmt_vec_info stmt_info,
7901 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7902 slp_tree slp_node, int slp_index,
7903 stmt_vec_info *vec_stmt,
7904 stmt_vector_for_cost *)
7906 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7907 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7908 imm_use_iterator imm_iter;
7909 tree lhs, lhs_type, bitsize, vec_bitsize;
7910 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7911 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7912 int ncopies;
7913 gimple *use_stmt;
7914 auto_vec<tree> vec_oprnds;
7915 int vec_entry = 0;
7916 poly_uint64 vec_index = 0;
7918 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7920 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7921 return false;
7923 /* FORNOW. CHECKME. */
7924 if (nested_in_vect_loop_p (loop, stmt_info))
7925 return false;
7927 /* If STMT is not relevant and it is a simple assignment and its inputs are
7928 invariant then it can remain in place, unvectorized. The original last
7929 scalar value that it computes will be used. */
7930 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7932 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7933 if (dump_enabled_p ())
7934 dump_printf_loc (MSG_NOTE, vect_location,
7935 "statement is simple and uses invariant. Leaving in "
7936 "place.\n");
7937 return true;
7940 if (slp_node)
7941 ncopies = 1;
7942 else
7943 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7945 if (slp_node)
7947 gcc_assert (slp_index >= 0);
7949 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7950 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7952 /* Get the last occurrence of the scalar index from the concatenation of
7953 all the slp vectors. Calculate which slp vector it is and the index
7954 within. */
7955 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7957 /* Calculate which vector contains the result, and which lane of
7958 that vector we need. */
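/* Worked example: num_scalar == 3, num_vec == 2, nunits == 4 gives a
   concatenation of 8 lanes; for slp_index == 1 the last occurrence is

     pos = 2*4 - 3 + 1 = 6  =>  vec_entry = 1, vec_index = 2.  */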
7959 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7961 if (dump_enabled_p ())
7962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7963 "Cannot determine which vector holds the"
7964 " final result.\n");
7965 return false;
7969 if (!vec_stmt)
7971 /* No transformation required. */
7972 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7974 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7975 OPTIMIZE_FOR_SPEED))
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "can't use a fully-masked loop because "
7980 "the target doesn't support extract last "
7981 "reduction.\n");
7982 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7984 else if (slp_node)
7986 if (dump_enabled_p ())
7987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7988 "can't use a fully-masked loop because an "
7989 "SLP statement is live after the loop.\n");
7990 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7992 else if (ncopies > 1)
7994 if (dump_enabled_p ())
7995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7996 "can't use a fully-masked loop because"
7997 " ncopies is greater than 1.\n");
7998 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8000 else
8002 gcc_assert (ncopies == 1 && !slp_node);
8003 vect_record_loop_mask (loop_vinfo,
8004 &LOOP_VINFO_MASKS (loop_vinfo),
8005 1, vectype);
8008 return true;
8011 /* Use the lhs of the original scalar statement. */
8012 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8014 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8015 : gimple_get_lhs (stmt);
8016 lhs_type = TREE_TYPE (lhs);
8018 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8019 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8020 : TYPE_SIZE (TREE_TYPE (vectype)));
8021 vec_bitsize = TYPE_SIZE (vectype);
8023 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8024 tree vec_lhs, bitstart;
8025 if (slp_node)
8027 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8029 /* Get the correct slp vectorized stmt. */
8030 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8031 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8032 vec_lhs = gimple_phi_result (phi);
8033 else
8034 vec_lhs = gimple_get_lhs (vec_stmt);
8036 /* Get entry to use. */
8037 bitstart = bitsize_int (vec_index);
8038 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8040 else
8042 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8043 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8044 gcc_checking_assert (ncopies == 1
8045 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8047 /* For multiple copies, get the last copy. */
8048 for (int i = 1; i < ncopies; ++i)
8049 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8051 /* Get the last lane in the vector. */
8052 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8055 gimple_seq stmts = NULL;
8056 tree new_tree;
8057 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8059 /* Emit:
8061 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8063 where VEC_LHS is the vectorized live-out result and MASK is
8064 the loop mask for the final iteration. */
8065 gcc_assert (ncopies == 1 && !slp_node);
8066 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8067 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8068 1, vectype, 0);
8069 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8070 scalar_type, mask, vec_lhs);
8072 /* Convert the extracted vector element to the required scalar type. */
8073 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8075 else
8077 tree bftype = TREE_TYPE (vectype);
8078 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8079 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8080 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8081 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8082 &stmts, true, NULL_TREE);
8085 if (stmts)
8086 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8088 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8089 single-argument PHI, just replace all uses of the PHI result. This is
8090 necessary because the lcssa PHI defining lhs may precede the new stmt. */
8091 use_operand_p use_p;
8092 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8093 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8094 && !is_gimple_debug (use_stmt))
8096 if (gimple_code (use_stmt) == GIMPLE_PHI
8097 && gimple_phi_num_args (use_stmt) == 1)
8099 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8101 else
8103 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8104 SET_USE (use_p, new_tree);
8106 update_stmt (use_stmt);
8109 return true;
8112 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8114 static void
8115 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8117 ssa_op_iter op_iter;
8118 imm_use_iterator imm_iter;
8119 def_operand_p def_p;
8120 gimple *ustmt;
8122 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8124 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8126 basic_block bb;
8128 if (!is_gimple_debug (ustmt))
8129 continue;
8131 bb = gimple_bb (ustmt);
8133 if (!flow_bb_inside_loop_p (loop, bb))
8135 if (gimple_debug_bind_p (ustmt))
8137 if (dump_enabled_p ())
8138 dump_printf_loc (MSG_NOTE, vect_location,
8139 "killing debug use\n");
8141 gimple_debug_bind_reset_value (ustmt);
8142 update_stmt (ustmt);
8144 else
8145 gcc_unreachable ();
8151 /* Given loop represented by LOOP_VINFO, return true if computation of
8152 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8153 otherwise. */
8155 static bool
8156 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8158 /* Constant case. */
8159 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8161 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8162 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8164 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8165 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8166 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8167 return true;
8170 widest_int max;
8171 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8172 /* Check the upper bound of loop niters. */
8173 if (get_max_loop_iterations (loop, &max))
8175 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8176 signop sgn = TYPE_SIGN (type);
8177 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8178 if (max < type_max)
8179 return true;
8181 return false;
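/* Example of the overflow being guarded against: with a 32-bit niters
   type, a loop whose latch runs 0xffffffff times has
   nitersm1 == 0xffffffff, and niters == nitersm1 + 1 wraps to 0; in
   that case neither check above succeeds and we return false.  */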
8184 /* Return a mask type with half the number of elements as TYPE. */
8186 tree
8187 vect_halve_mask_nunits (tree type)
8189 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8190 return build_truth_vector_type (nunits, current_vector_size);
8193 /* Return a mask type with twice as many elements as TYPE. */
8195 tree
8196 vect_double_mask_nunits (tree type)
8198 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8199 return build_truth_vector_type (nunits, current_vector_size);
8202 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8203 contain a sequence of NVECTORS masks that each control a vector of type
8204 VECTYPE. */
8206 void
8207 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8208 unsigned int nvectors, tree vectype)
8210 gcc_assert (nvectors != 0);
8211 if (masks->length () < nvectors)
8212 masks->safe_grow_cleared (nvectors);
8213 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8214 /* The number of scalars per iteration and the number of vectors are
8215 both compile-time constants. */
8216 unsigned int nscalars_per_iter
8217 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8218 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8219 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8221 rgm->max_nscalars_per_iter = nscalars_per_iter;
8222 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8226 /* Given a complete set of masks MASKS, extract mask number INDEX
8227 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8228 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8230 See the comment above vec_loop_masks for more details about the mask
8231 arrangement. */
8233 tree
8234 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8235 unsigned int nvectors, tree vectype, unsigned int index)
8237 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8238 tree mask_type = rgm->mask_type;
8240 /* Populate the rgroup's mask array, if this is the first time we've
8241 used it. */
8242 if (rgm->masks.is_empty ())
8244 rgm->masks.safe_grow_cleared (nvectors);
8245 for (unsigned int i = 0; i < nvectors; ++i)
8247 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8248 /* Provide a dummy definition until the real one is available. */
8249 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8250 rgm->masks[i] = mask;
8254 tree mask = rgm->masks[index];
8255 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8256 TYPE_VECTOR_SUBPARTS (vectype)))
8258 /* A loop mask for data type X can be reused for data type Y
8259 if X has N times more elements than Y and if Y's elements
8260 are N times bigger than X's. In this case each sequence
8261 of N elements in the loop mask will be all-zero or all-one.
8262 We can then view-convert the mask so that each sequence of
8263 N elements is replaced by a single element. */
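/* For example, with N == 2 a mask computed for eight 16-bit elements
   whose lanes are { T, T, T, T, T, T, F, F } can be view-converted to
   control four 32-bit elements as { T, T, T, F }, because each pair of
   lanes is guaranteed to be all-true or all-false.  */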
8264 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8265 TYPE_VECTOR_SUBPARTS (vectype)));
8266 gimple_seq seq = NULL;
8267 mask_type = build_same_sized_truth_vector_type (vectype);
8268 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8269 if (seq)
8270 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8272 return mask;
8275 /* Scale the profile counters of LOOP, which is vectorized by
8276 factor VF, based on the estimated iteration count. */
8278 static void
8279 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8281 edge preheader = loop_preheader_edge (loop);
8282 /* Reduce loop iterations by the vectorization factor. */
8283 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8284 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8286 if (freq_h.nonzero_p ())
8288 profile_probability p;
8290 /* Avoid dropping loop body profile counter to 0 because of zero count
8291 in loop's preheader. */
8292 if (!(freq_e == profile_count::zero ()))
8293 freq_e = freq_e.force_nonzero ();
8294 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8295 scale_loop_frequencies (loop, p);
8298 edge exit_e = single_exit (loop);
8299 exit_e->probability = profile_probability::always ()
8300 .apply_scale (1, new_est_niter + 1);
8302 edge exit_l = single_pred_edge (loop->latch);
8303 profile_probability prob = exit_l->probability;
8304 exit_l->probability = exit_e->probability.invert ();
8305 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8306 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8309 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8310 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8311 stmt_vec_info. */
8313 static void
8314 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8315 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8317 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8318 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8320 if (dump_enabled_p ())
8321 dump_printf_loc (MSG_NOTE, vect_location,
8322 "------>vectorizing statement: %G", stmt_info->stmt);
8324 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8325 vect_loop_kill_debug_uses (loop, stmt_info);
8327 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8328 && !STMT_VINFO_LIVE_P (stmt_info))
8329 return;
8331 if (STMT_VINFO_VECTYPE (stmt_info))
8333 poly_uint64 nunits
8334 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8335 if (!STMT_SLP_TYPE (stmt_info)
8336 && maybe_ne (nunits, vf)
8337 && dump_enabled_p ())
8338 /* For SLP, VF is set according to the unrolling factor, not to the
8339 vector size, hence this message is not valid for SLP. */
8340 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8343 /* Pure SLP statements have already been vectorized. We still need
8344 to apply loop vectorization to hybrid SLP statements. */
8345 if (PURE_SLP_STMT (stmt_info))
8346 return;
8348 if (dump_enabled_p ())
8349 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8351 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8352 *seen_store = stmt_info;
8355 /* Function vect_transform_loop.
8357 The analysis phase has determined that the loop is vectorizable.
8358 Vectorize the loop - create vectorized stmts to replace the scalar
8359 stmts in the loop, and update the loop exit condition.
8360 Returns the scalar epilogue loop, if any. */
8362 struct loop *
8363 vect_transform_loop (loop_vec_info loop_vinfo)
8365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8366 struct loop *epilogue = NULL;
8367 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8368 int nbbs = loop->num_nodes;
8369 int i;
8370 tree niters_vector = NULL_TREE;
8371 tree step_vector = NULL_TREE;
8372 tree niters_vector_mult_vf = NULL_TREE;
8373 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8374 unsigned int lowest_vf = constant_lower_bound (vf);
8375 gimple *stmt;
8376 bool check_profitability = false;
8377 unsigned int th;
8379 DUMP_VECT_SCOPE ("vec_transform_loop");
8381 loop_vinfo->shared->check_datarefs ();
8383 /* Use the more conservative vectorization threshold. If the number
8384 of iterations is constant, assume the cost check has been performed
8385 by our caller. If the threshold makes all loops profitable that
8386 run at least the (estimated) vectorization factor number of times,
8387 checking is pointless, too. */
8388 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8389 if (th >= vect_vf_for_cost (loop_vinfo)
8390 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8392 if (dump_enabled_p ())
8393 dump_printf_loc (MSG_NOTE, vect_location,
8394 "Profitability threshold is %d loop iterations.\n",
8395 th);
8396 check_profitability = true;
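/* For example, with a cost-model threshold of 7 and a vectorization
   factor of 4, a runtime check (roughly "niters >= 7") is emitted later
   so that short-running loops fall back to the scalar version.  */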
8399 /* Make sure there exists a single-predecessor exit bb. Do this before
8400 versioning. */
8401 edge e = single_exit (loop);
8402 if (! single_pred_p (e->dest))
8404 split_loop_exit_edge (e, true);
8405 if (dump_enabled_p ())
8406 dump_printf (MSG_NOTE, "split exit edge\n");
8409 /* Version the loop first, if required, so the profitability check
8410 comes first. */
8412 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8414 poly_uint64 versioning_threshold
8415 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8416 if (check_profitability
8417 && ordered_p (poly_uint64 (th), versioning_threshold))
8419 versioning_threshold = ordered_max (poly_uint64 (th),
8420 versioning_threshold);
8421 check_profitability = false;
8423 struct loop *sloop
8424 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8425 versioning_threshold);
8426 sloop->force_vectorize = false;
8427 check_profitability = false;
8430 /* Make sure there exists a single-predecessor exit bb also on the
8431 scalar loop copy. Do this after versioning but before peeling
8432 so that the CFG structure is fine for both the scalar and the
8433 if-converted loop, and slpeel_duplicate_current_defs_from_edges
8434 sees matched loop-closed PHI nodes on the exit. */
8435 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8437 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8438 if (! single_pred_p (e->dest))
8440 split_loop_exit_edge (e, true);
8441 if (dump_enabled_p ())
8442 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8446 tree niters = vect_build_loop_niters (loop_vinfo);
8447 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8448 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8449 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8450 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8451 &step_vector, &niters_vector_mult_vf, th,
8452 check_profitability, niters_no_overflow);
8454 if (niters_vector == NULL_TREE)
8456 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8457 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8458 && known_eq (lowest_vf, vf))
8460 niters_vector
8461 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8462 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8463 step_vector = build_one_cst (TREE_TYPE (niters));
8465 else
8466 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8467 &step_vector, niters_no_overflow);
8470 /* 1) Make sure the loop header has exactly two entries
8471 2) Make sure we have a preheader basic block. */
8473 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8475 split_edge (loop_preheader_edge (loop));
8477 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8478 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8479 /* This will deal with any possible peeling. */
8480 vect_prepare_for_masked_peels (loop_vinfo);
8482 /* Schedule the SLP instances first, then handle loop vectorization
8483 below. */
8484 if (!loop_vinfo->slp_instances.is_empty ())
8486 DUMP_VECT_SCOPE ("scheduling SLP instances");
8487 vect_schedule_slp (loop_vinfo);
8490 /* FORNOW: the vectorizer supports only loops whose body consists
8491 of one basic block (header + empty latch). When the vectorizer
8492 supports more involved loop forms, the order in which the BBs are
8493 traversed will need to be reconsidered. */
8495 for (i = 0; i < nbbs; i++)
8497 basic_block bb = bbs[i];
8498 stmt_vec_info stmt_info;
8500 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8501 gsi_next (&si))
8503 gphi *phi = si.phi ();
8504 if (dump_enabled_p ())
8505 dump_printf_loc (MSG_NOTE, vect_location,
8506 "------>vectorizing phi: %G", phi);
8507 stmt_info = loop_vinfo->lookup_stmt (phi);
8508 if (!stmt_info)
8509 continue;
8511 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8512 vect_loop_kill_debug_uses (loop, stmt_info);
8514 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8515 && !STMT_VINFO_LIVE_P (stmt_info))
8516 continue;
8518 if (STMT_VINFO_VECTYPE (stmt_info)
8519 && (maybe_ne
8520 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8521 && dump_enabled_p ())
8522 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8524 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8525 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8526 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8527 && ! PURE_SLP_STMT (stmt_info))
8529 if (dump_enabled_p ())
8530 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8531 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8535 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8536 !gsi_end_p (si);)
8538 stmt = gsi_stmt (si);
8539 /* During vectorization remove existing clobber stmts. */
8540 if (gimple_clobber_p (stmt))
8542 unlink_stmt_vdef (stmt);
8543 gsi_remove (&si, true);
8544 release_defs (stmt);
8546 else
8548 stmt_info = loop_vinfo->lookup_stmt (stmt);
8550 /* vector stmts created in the outer-loop during vectorization of
8551 stmts in an inner-loop may not have a stmt_info, and do not
8552 need to be vectorized. */
8553 stmt_vec_info seen_store = NULL;
8554 if (stmt_info)
8556 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8558 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8559 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8560 !gsi_end_p (subsi); gsi_next (&subsi))
8562 stmt_vec_info pat_stmt_info
8563 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8564 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8565 &si, &seen_store);
8567 stmt_vec_info pat_stmt_info
8568 = STMT_VINFO_RELATED_STMT (stmt_info);
8569 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8570 &seen_store);
8572 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8573 &seen_store);
8575 gsi_next (&si);
8576 if (seen_store)
8578 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8579 /* Interleaving. The vectorization of the
8580 interleaving chain was completed - free all
8581 the stores in the chain. */
8582 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8583 else
8584 /* Free the attached stmt_vec_info and remove the stmt. */
8585 loop_vinfo->remove_stmt (stmt_info);
8590 /* Stub out scalar statements that must not survive vectorization.
8591 Doing this here helps with grouped statements, or statements that
8592 are involved in patterns. */
8593 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8594 !gsi_end_p (gsi); gsi_next (&gsi))
8596 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8597 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8599 tree lhs = gimple_get_lhs (call);
8600 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8602 tree zero = build_zero_cst (TREE_TYPE (lhs));
8603 gimple *new_stmt = gimple_build_assign (lhs, zero);
8604 gsi_replace (&gsi, new_stmt, true);
8608 } /* BBs in loop */
8610 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8611 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8612 if (integer_onep (step_vector))
8613 niters_no_overflow = true;
8614 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8615 niters_vector_mult_vf, !niters_no_overflow);
8617 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8618 scale_profile_for_vect_loop (loop, assumed_vf);
8620 /* True if the final iteration might not handle a full vector's
8621 worth of scalar iterations. */
8622 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8623 /* The minimum number of iterations performed by the epilogue. This
8624 is 1 when peeling for gaps because we always need a final scalar
8625 iteration. */
8626 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8627 /* +1 to convert latch counts to loop iteration counts,
8628 -min_epilogue_iters to remove iterations that cannot be performed
8629 by the vector code. */
8630 int bias_for_lowest = 1 - min_epilogue_iters;
8631 int bias_for_assumed = bias_for_lowest;
8632 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8633 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8635 /* When the amount of peeling is known at compile time, the first
8636 iteration will have exactly alignment_npeels active elements.
8637 In the worst case it will have at least one. */
8638 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8639 bias_for_lowest += lowest_vf - min_first_active;
8640 bias_for_assumed += assumed_vf - min_first_active;
8642 /* In these calculations the "- 1" converts loop iteration counts
8643 back to latch counts. */
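/* Worked example: with lowest_vf == 4, no peeling for gaps and no full
   masking (bias_for_lowest == 1), a scalar latch bound of 102 (at most
   103 iterations) becomes

     udiv_floor (102 + 1, 4) - 1 = 24

   i.e. at most 25 vector iterations covering 100 scalar iterations,
   with the remaining ones left to the epilogue.  */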
8644 if (loop->any_upper_bound)
8645 loop->nb_iterations_upper_bound
8646 = (final_iter_may_be_partial
8647 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8648 lowest_vf) - 1
8649 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8650 lowest_vf) - 1);
8651 if (loop->any_likely_upper_bound)
8652 loop->nb_iterations_likely_upper_bound
8653 = (final_iter_may_be_partial
8654 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8655 + bias_for_lowest, lowest_vf) - 1
8656 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8657 + bias_for_lowest, lowest_vf) - 1);
8658 if (loop->any_estimate)
8659 loop->nb_iterations_estimate
8660 = (final_iter_may_be_partial
8661 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8662 assumed_vf) - 1
8663 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8664 assumed_vf) - 1);
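/* Rough worked example (illustrative numbers only): with an original
   latch-count upper bound of 10 (at most 11 scalar iterations),
   lowest_vf == 4, no masking and no peeling for gaps, bias_for_lowest
   is 1 and the new bound is udiv_floor (10 + 1, 4) - 1 == 1, i.e. the
   vector loop runs at most 2 iterations covering 8 scalar iterations,
   with the remainder left to the epilogue. */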
8666 if (dump_enabled_p ())
8668 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8670 dump_printf_loc (MSG_NOTE, vect_location,
8671 "LOOP VECTORIZED\n");
8672 if (loop->inner)
8673 dump_printf_loc (MSG_NOTE, vect_location,
8674 "OUTER LOOP VECTORIZED\n");
8675 dump_printf (MSG_NOTE, "\n");
8677 else
8679 dump_printf_loc (MSG_NOTE, vect_location,
8680 "LOOP EPILOGUE VECTORIZED (VS=");
8681 dump_dec (MSG_NOTE, current_vector_size);
8682 dump_printf (MSG_NOTE, ")\n");
8686 /* Loops vectorized with a variable factor won't benefit from
8687 unrolling/peeling. */
8688 if (!vf.is_constant ())
8690 loop->unroll = 1;
8691 if (dump_enabled_p ())
8692 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8693 " variable-length vectorization factor\n");
8695 /* Free SLP instances here because otherwise stmt reference counting
8696 won't work. */
8697 slp_instance instance;
8698 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8699 vect_free_slp_instance (instance, true);
8700 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8701 /* Clear the safelen field since its value is invalid after vectorization:
8702 the vectorized loop can now have loop-carried dependencies. */
8703 loop->safelen = 0;
8705 /* Don't vectorize an epilogue of an epilogue loop. */
8706 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8707 epilogue = NULL;
8709 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8710 epilogue = NULL;
8712 if (epilogue)
8714 auto_vector_sizes vector_sizes;
8715 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8716 unsigned int next_size = 0;
8718 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8719 on niters already adjusted for the iterations of the prologue. */
8720 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8721 && known_eq (vf, lowest_vf))
8723 unsigned HOST_WIDE_INT eiters
8724 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8725 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8726 eiters
8727 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8728 epilogue->nb_iterations_upper_bound = eiters - 1;
8729 epilogue->any_upper_bound = true;
8731 unsigned int ratio;
8732 while (next_size < vector_sizes.length ()
8733 && !(constant_multiple_p (current_vector_size,
8734 vector_sizes[next_size], &ratio)
8735 && eiters >= lowest_vf / ratio))
8736 next_size += 1;
8738 else
8739 while (next_size < vector_sizes.length ()
8740 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8741 next_size += 1;
8743 if (next_size == vector_sizes.length ())
8744 epilogue = NULL;
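/* Rough sketch of the choice above (sizes and counts assumed for
   illustration): with LOOP_VINFO_INT_NITERS == 23, lowest_vf == 8 and no
   peeling for gaps, eiters == 23 % 8 == 7 and the epilogue's upper bound
   becomes 6. A candidate vector size is then accepted only if
   current_vector_size is a constant multiple of it (ratio) and
   eiters >= lowest_vf / ratio; e.g. with current_vector_size 32 the
   full-width candidate (ratio 1) would need 8 remaining iterations and
   is skipped, while a 16-byte candidate (ratio 2) needs only 4. */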
8747 if (epilogue)
8749 epilogue->force_vectorize = loop->force_vectorize;
8750 epilogue->safelen = loop->safelen;
8751 epilogue->dont_vectorize = false;
8753 /* We may need to if-convert the epilogue to vectorize it. */
8754 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8755 tree_if_conversion (epilogue);
8758 return epilogue;
8761 /* The code below performs a simple optimization: it reverts if-conversion
8762 for masked stores, i.e. if the mask of a store is all zeros, the store is
8763 not executed, and neither are the producers of the stored values where possible.
8764 For example,
8765 for (i=0; i<n; i++)
8766 if (c[i])
8768 p1[i] += 1;
8769 p2[i] = p3[i] + 2;
8771 this transformation will produce the following semi-hammock:
8773 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8775 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8776 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8777 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8778 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8779 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8780 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
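   A rough sketch of the control flow created below (block roles follow
   the local variable names; layout is illustrative only):

     bb:        if (mask == { 0, ... }) goto join_bb; else goto store_bb;
     store_bb:  the sunk masked loads, arithmetic and masked stores;
     join_bb:   the rest of the loop body, with a virtual-operand PHI
                merging the two paths.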
8784 void
8785 optimize_mask_stores (struct loop *loop)
8787 basic_block *bbs = get_loop_body (loop);
8788 unsigned nbbs = loop->num_nodes;
8789 unsigned i;
8790 basic_block bb;
8791 struct loop *bb_loop;
8792 gimple_stmt_iterator gsi;
8793 gimple *stmt;
8794 auto_vec<gimple *> worklist;
8795 auto_purge_vect_location sentinel;
8797 vect_location = find_loop_location (loop);
8798 /* Collect all masked stores in the loop, if any. */
8799 for (i = 0; i < nbbs; i++)
8801 bb = bbs[i];
8802 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8803 gsi_next (&gsi))
8805 stmt = gsi_stmt (gsi);
8806 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8807 worklist.safe_push (stmt);
8811 free (bbs);
8812 if (worklist.is_empty ())
8813 return;
8815 /* Loop has masked stores. */
8816 while (!worklist.is_empty ())
8818 gimple *last, *last_store;
8819 edge e, efalse;
8820 tree mask;
8821 basic_block store_bb, join_bb;
8822 gimple_stmt_iterator gsi_to;
8823 tree vdef, new_vdef;
8824 gphi *phi;
8825 tree vectype;
8826 tree zero;
8828 last = worklist.pop ();
8829 mask = gimple_call_arg (last, 2);
8830 bb = gimple_bb (last);
8831 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
8832 the same loop as if_bb. That loop can differ from LOOP when a two-level
8833 loop nest is vectorized and the masked store belongs to the inner
8834 one. */
8835 e = split_block (bb, last);
8836 bb_loop = bb->loop_father;
8837 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8838 join_bb = e->dest;
8839 store_bb = create_empty_bb (bb);
8840 add_bb_to_loop (store_bb, bb_loop);
8841 e->flags = EDGE_TRUE_VALUE;
8842 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8843 /* Put STORE_BB into the likely part. */
8844 efalse->probability = profile_probability::unlikely ();
8845 store_bb->count = efalse->count ();
8846 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8847 if (dom_info_available_p (CDI_DOMINATORS))
8848 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8849 if (dump_enabled_p ())
8850 dump_printf_loc (MSG_NOTE, vect_location,
8851 "Create new block %d to sink mask stores.",
8852 store_bb->index);
8853 /* Create vector comparison with boolean result. */
8854 vectype = TREE_TYPE (mask);
8855 zero = build_zero_cst (vectype);
8856 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8857 gsi = gsi_last_bb (bb);
8858 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
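/* Illustrative form of the guard built above (SSA name invented for this
   sketch):
     if (mask_24 == { 0, ... }) goto join_bb; else goto store_bb;
   i.e. the sunk masked stores are skipped when the mask is all zeros. */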
8859 /* Create new PHI node for vdef of the last masked store:
8860 .MEM_2 = VDEF <.MEM_1>
8861 will be converted to
8862 .MEM_3 = VDEF <.MEM_1>
8863 and a new PHI node will be created in the join bb
8864 .MEM_2 = PHI <.MEM_1, .MEM_3>
8866 vdef = gimple_vdef (last);
8867 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8868 gimple_set_vdef (last, new_vdef);
8869 phi = create_phi_node (vdef, join_bb);
8870 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8872 /* Move all masked stores with the same mask into STORE_BB if possible. */
8873 while (true)
8875 gimple_stmt_iterator gsi_from;
8876 gimple *stmt1 = NULL;
8878 /* Move masked store to STORE_BB. */
8879 last_store = last;
8880 gsi = gsi_for_stmt (last);
8881 gsi_from = gsi;
8882 /* Shift GSI to the previous stmt for further traversal. */
8883 gsi_prev (&gsi);
8884 gsi_to = gsi_start_bb (store_bb);
8885 gsi_move_before (&gsi_from, &gsi_to);
8886 /* Set GSI_TO to the start of the now non-empty block. */
8887 gsi_to = gsi_start_bb (store_bb);
8888 if (dump_enabled_p ())
8889 dump_printf_loc (MSG_NOTE, vect_location,
8890 "Move stmt to created bb\n%G", last);
8891 /* Move all stored value producers if possible. */
8892 while (!gsi_end_p (gsi))
8894 tree lhs;
8895 imm_use_iterator imm_iter;
8896 use_operand_p use_p;
8897 bool res;
8899 /* Skip debug statements. */
8900 if (is_gimple_debug (gsi_stmt (gsi)))
8902 gsi_prev (&gsi);
8903 continue;
8905 stmt1 = gsi_stmt (gsi);
8906 /* Do not consider statements writing to memory or having a
8907 volatile operand. */
8908 if (gimple_vdef (stmt1)
8909 || gimple_has_volatile_ops (stmt1))
8910 break;
8911 gsi_from = gsi;
8912 gsi_prev (&gsi);
8913 lhs = gimple_get_lhs (stmt1);
8914 if (!lhs)
8915 break;
8917 /* LHS of vectorized stmt must be SSA_NAME. */
8918 if (TREE_CODE (lhs) != SSA_NAME)
8919 break;
8921 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8923 /* Remove dead scalar statement. */
8924 if (has_zero_uses (lhs))
8926 gsi_remove (&gsi_from, true);
8927 continue;
8931 /* Check that LHS does not have uses outside of STORE_BB. */
8932 res = true;
8933 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8935 gimple *use_stmt;
8936 use_stmt = USE_STMT (use_p);
8937 if (is_gimple_debug (use_stmt))
8938 continue;
8939 if (gimple_bb (use_stmt) != store_bb)
8941 res = false;
8942 break;
8945 if (!res)
8946 break;
8948 if (gimple_vuse (stmt1)
8949 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8950 break;
8952 /* Can move STMT1 to STORE_BB. */
8953 if (dump_enabled_p ())
8954 dump_printf_loc (MSG_NOTE, vect_location,
8955 "Move stmt to created bb\n%G", stmt1);
8956 gsi_move_before (&gsi_from, &gsi_to);
8957 /* Shift GSI_TO for further insertion. */
8958 gsi_prev (&gsi_to);
8960 /* Move other masked stores with the same mask into STORE_BB. */
8961 if (worklist.is_empty ()
8962 || gimple_call_arg (worklist.last (), 2) != mask
8963 || worklist.last () != stmt1)
8964 break;
8965 last = worklist.pop ();
8967 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);