[official-gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes currently need
142 to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
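/* For illustration only: a minimal sketch of the optab query described
   above.  optab_handler, add_optab and CODE_FOR_nothing are the real
   interfaces; the helper itself is made up for this example.

     static bool
     example_supports_vector_add_p (machine_mode vmode)
     {
       return optab_handler (add_optab, vmode) != CODE_FOR_nothing;
     }

   A CODE_FOR_nothing result means the target has no instruction for the
   operation in that mode, so the statement cannot be vectorized.  */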
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
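/* A note on the opt_result idiom used above: opt_result::failure_at
   records the reason for a failure at a statement or location, and every
   caller simply tests the value and forwards it unchanged, e.g.
   (a sketch of the pattern already used in this function):

     opt_result res = vect_get_vector_types_for_stmt (stmt_info,
                                                      &stmt_vectype,
                                                      &nunits_vectype);
     if (!res)
       return res;

   so the eventual -fopt-info diagnostic comes from the innermost point
   that detected the problem.  */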
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
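/* Concretely, with VF == 4 (and N a multiple of 4) the strip-mined loop
   above corresponds to the scalar nest below; this is only a sketch, the
   vectorizer emits vector statements rather than an inner scalar loop:

     for (i = 0; i < N; i += 4)
       for (j = 0; j < 4; j++)
         a[i + j] = b[i + j] + c[i + j];  */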
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
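/* Worked example (illustrative): with a 16-byte vector size, a statement
   operating on "short" gets a vector type with 8 units and one operating
   on "int" gets 4 units; vect_update_max_nunits keeps their common
   multiple, so the VF becomes 8 and each "int" statement is vectorized
   using two vector statements per iteration of the vector loop.  */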
389 /* Function vect_is_simple_iv_evolution.
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
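/* Illustrative example: for a pointer updated as "p += 4" on every
   iteration, scev describes p as the chrec {p_init, +, 4}_loop, so INIT
   is p_init and STEP is 4, which the function above accepts.  A step
   that is itself defined inside the loop is rejected, and a
   floating-point step is only accepted with -fassociative-math.  */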
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner loop, if one exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
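/* Illustrative example: for "for (i = 0; i < n; i++)" with n known to be
   positive, the latch runs n - 1 times, so *NUMBER_OF_ITERATIONSM1 is
   n - 1 and *NUMBER_OF_ITERATIONS is the header count n (the "+ 1" added
   just above), subject to the overflow caveat in the comment.  */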
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 simd_if_cond (NULL_TREE),
823 unaligned_dr (NULL),
824 peeling_for_alignment (0),
825 ptr_mask (0),
826 ivexpr_map (NULL),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 operands_swapped (false),
835 no_data_dependencies (false),
836 has_mask_store (false),
837 scalar_loop (NULL),
838 orig_loop_info (NULL)
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the
843 same as a reversed postorder traversal, so we are safe. */
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
847 gcc_assert (nbbs == loop->num_nodes);
849 for (unsigned int i = 0; i < nbbs; i++)
851 basic_block bb = bbs[i];
852 gimple_stmt_iterator si;
854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
858 add_stmt (phi);
861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
865 add_stmt (stmt);
866 /* If the .GOMP_SIMD_LANE call for the current loop has 2 arguments, the
867 second argument is the #pragma omp simd if (x) condition: when it is 0,
868 the loop shouldn't be vectorized; when it is a non-zero constant, it
869 should be vectorized normally; otherwise the loop is versioned, with the
870 vectorized copy taken when the condition is non-zero at runtime. */
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt) >= 2
876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
880 tree arg = gimple_call_arg (stmt, 1);
881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
890 /* Free all levels of MASKS. */
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
905 _loop_vec_info::~_loop_vec_info ()
907 int nbbs;
908 gimple_stmt_iterator si;
909 int j;
911 nbbs = loop->num_nodes;
912 for (j = 0; j < nbbs; j++)
914 basic_block bb = bbs[j];
915 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
917 gimple *stmt = gsi_stmt (si);
919 /* We may have broken canonical form by moving a constant
920 into RHS1 of a commutative op. Fix such occurrences. */
921 if (operands_swapped && is_gimple_assign (stmt))
923 enum tree_code code = gimple_assign_rhs_code (stmt);
925 if ((code == PLUS_EXPR
926 || code == POINTER_PLUS_EXPR
927 || code == MULT_EXPR)
928 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
929 swap_ssa_operands (stmt,
930 gimple_assign_rhs1_ptr (stmt),
931 gimple_assign_rhs2_ptr (stmt));
932 else if (code == COND_EXPR
933 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
935 tree cond_expr = gimple_assign_rhs1 (stmt);
936 enum tree_code cond_code = TREE_CODE (cond_expr);
938 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
940 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
941 0));
942 cond_code = invert_tree_comparison (cond_code,
943 honor_nans);
944 if (cond_code != ERROR_MARK)
946 TREE_SET_CODE (cond_expr, cond_code);
947 swap_ssa_operands (stmt,
948 gimple_assign_rhs2_ptr (stmt),
949 gimple_assign_rhs3_ptr (stmt));
954 gsi_next (&si);
958 free (bbs);
960 release_vec_loop_masks (&masks);
961 delete ivexpr_map;
963 loop->aux = NULL;
966 /* Return an invariant or register for EXPR and emit necessary
967 computations in the LOOP_VINFO loop preheader. */
969 tree
970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
972 if (is_gimple_reg (expr)
973 || is_gimple_min_invariant (expr))
974 return expr;
976 if (! loop_vinfo->ivexpr_map)
977 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
978 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
979 if (! cached)
981 gimple_seq stmts = NULL;
982 cached = force_gimple_operand (unshare_expr (expr),
983 &stmts, true, NULL_TREE);
984 if (stmts)
986 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
987 gsi_insert_seq_on_edge_immediate (e, stmts);
990 return cached;
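/* Illustrative use only (the names "step" and "vf_tree" are made up):
   hoist a loop-invariant expression once and reuse the cached SSA name
   everywhere it is needed:

     tree nbytes = fold_build2 (MULT_EXPR, sizetype, step, vf_tree);
     tree nbytes_reg = cse_and_gimplify_to_preheader (loop_vinfo, nbytes);

   A later call with an equal expression returns the same cached name,
   which keeps the preheader code small.  */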
993 /* Return true if we can use CMP_TYPE as the comparison type to produce
994 all masks required to mask LOOP_VINFO. */
996 static bool
997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
999 rgroup_masks *rgm;
1000 unsigned int i;
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002 if (rgm->mask_type != NULL_TREE
1003 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1004 cmp_type, rgm->mask_type,
1005 OPTIMIZE_FOR_SPEED))
1006 return false;
1007 return true;
1010 /* Calculate the maximum number of scalars per iteration for every
1011 rgroup in LOOP_VINFO. */
1013 static unsigned int
1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1016 unsigned int res = 1;
1017 unsigned int i;
1018 rgroup_masks *rgm;
1019 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1020 res = MAX (res, rgm->max_nscalars_per_iter);
1021 return res;
1024 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1025 whether we can actually generate the masks required. Return true if so,
1026 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1028 static bool
1029 vect_verify_full_masking (loop_vec_info loop_vinfo)
1031 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1032 unsigned int min_ni_width;
1034 /* Use a normal loop if there are no statements that need masking.
1035 This only happens in rare degenerate cases: it means that the loop
1036 has no loads, no stores, and no live-out values. */
1037 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1038 return false;
1040 /* Get the maximum number of iterations that is representable
1041 in the counter type. */
1042 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1043 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1045 /* Get a more refined estimate for the number of iterations. */
1046 widest_int max_back_edges;
1047 if (max_loop_iterations (loop, &max_back_edges))
1048 max_ni = wi::smin (max_ni, max_back_edges + 1);
1050 /* Account for rgroup masks, in which each bit is replicated N times. */
1051 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1053 /* Work out how many bits we need to represent the limit. */
1054 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1056 /* Find a scalar mode for which WHILE_ULT is supported. */
1057 opt_scalar_int_mode cmp_mode_iter;
1058 tree cmp_type = NULL_TREE;
1059 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1061 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1062 if (cmp_bits >= min_ni_width
1063 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1065 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1066 if (this_type
1067 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1069 /* Although we could stop as soon as we find a valid mode,
1070 it's often better to continue until we hit Pmode, since the
1071 operands to the WHILE are more likely to be reusable in
1072 address calculations. */
1073 cmp_type = this_type;
1074 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1075 break;
1080 if (!cmp_type)
1081 return false;
1083 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1084 return true;
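/* Worked example (illustrative numbers): if the loop runs at most 1000
   iterations and the largest rgroup needs 2 scalars per iteration, then
   max_ni is 2000, which needs 11 bits; any integer mode of at least 11
   bits whose WHILE_ULT is supported for every required mask type can
   serve as the comparison type, and the search above keeps going up to
   Pmode because wider types are more reusable in address arithmetic.  */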
1087 /* Calculate the cost of one scalar iteration of the loop. */
1088 static void
1089 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1092 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1093 int nbbs = loop->num_nodes, factor;
1094 int innerloop_iters, i;
1096 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1098 /* Gather costs for statements in the scalar loop. */
1100 /* FORNOW. */
1101 innerloop_iters = 1;
1102 if (loop->inner)
1103 innerloop_iters = 50; /* FIXME */
1105 for (i = 0; i < nbbs; i++)
1107 gimple_stmt_iterator si;
1108 basic_block bb = bbs[i];
1110 if (bb->loop_father == loop->inner)
1111 factor = innerloop_iters;
1112 else
1113 factor = 1;
1115 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1117 gimple *stmt = gsi_stmt (si);
1118 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1120 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1121 continue;
1123 /* Skip stmts that are not vectorized inside the loop. */
1124 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1125 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1126 && (!STMT_VINFO_LIVE_P (vstmt_info)
1127 || !VECTORIZABLE_CYCLE_DEF
1128 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1129 continue;
1131 vect_cost_for_stmt kind;
1132 if (STMT_VINFO_DATA_REF (stmt_info))
1134 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1135 kind = scalar_load;
1136 else
1137 kind = scalar_store;
1139 else
1140 kind = scalar_stmt;
1142 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1143 factor, kind, stmt_info, 0, vect_prologue);
1147 /* Now accumulate cost. */
1148 void *target_cost_data = init_cost (loop);
1149 stmt_info_for_cost *si;
1150 int j;
1151 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1152 j, si)
1153 (void) add_stmt_cost (target_cost_data, si->count,
1154 si->kind, si->stmt_info, si->misalign,
1155 vect_body);
1156 unsigned dummy, body_cost = 0;
1157 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1158 destroy_cost_data (target_cost_data);
1159 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1163 /* Function vect_analyze_loop_form_1.
1165 Verify that certain CFG restrictions hold, including:
1166 - the loop has a pre-header
1167 - the loop has a single entry and exit
1168 - the loop exit condition is simple enough
1169 - the number of iterations can be analyzed, i.e., it is a countable loop.
1170 The niter may be analyzable only under some assumptions. */
1172 opt_result
1173 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1174 tree *assumptions, tree *number_of_iterationsm1,
1175 tree *number_of_iterations, gcond **inner_loop_cond)
1177 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1179 /* Different restrictions apply when we are considering an inner-most loop,
1180 vs. an outer (nested) loop.
1181 (FORNOW. May want to relax some of these restrictions in the future). */
1183 if (!loop->inner)
1185 /* Inner-most loop. We currently require that the number of BBs is
1186 exactly 2 (the header and latch). Vectorizable inner-most loops
1187 look like this:
1189 (pre-header)
1191 header <--------+
1192 | | |
1193 | +--> latch --+
1195 (exit-bb) */
1197 if (loop->num_nodes != 2)
1198 return opt_result::failure_at (vect_location,
1199 "not vectorized:"
1200 " control flow in loop.\n");
1202 if (empty_block_p (loop->header))
1203 return opt_result::failure_at (vect_location,
1204 "not vectorized: empty loop.\n");
1206 else
1208 struct loop *innerloop = loop->inner;
1209 edge entryedge;
1211 /* Nested loop. We currently require that the loop is doubly-nested,
1212 contains a single inner loop, and the number of BBs is exactly 5.
1213 Vectorizable outer-loops look like this:
1215 (pre-header)
1217 header <---+
1219 inner-loop |
1221 tail ------+
1223 (exit-bb)
1225 The inner-loop has the properties expected of inner-most loops
1226 as described above. */
1228 if ((loop->inner)->inner || (loop->inner)->next)
1229 return opt_result::failure_at (vect_location,
1230 "not vectorized:"
1231 " multiple nested loops.\n");
1233 if (loop->num_nodes != 5)
1234 return opt_result::failure_at (vect_location,
1235 "not vectorized:"
1236 " control flow in loop.\n");
1238 entryedge = loop_preheader_edge (innerloop);
1239 if (entryedge->src != loop->header
1240 || !single_exit (innerloop)
1241 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized:"
1244 " unsupported outerloop form.\n");
1246 /* Analyze the inner-loop. */
1247 tree inner_niterm1, inner_niter, inner_assumptions;
1248 opt_result res
1249 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1250 &inner_assumptions, &inner_niterm1,
1251 &inner_niter, NULL);
1252 if (!res)
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "not vectorized: Bad inner loop.\n");
1257 return res;
1260 /* Don't support analyzing niter under assumptions for inner
1261 loop. */
1262 if (!integer_onep (inner_assumptions))
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized: Bad inner loop.\n");
1266 if (!expr_invariant_in_loop_p (loop, inner_niter))
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized: inner-loop count not"
1269 " invariant.\n");
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_NOTE, vect_location,
1273 "Considering outer-loop vectorization.\n");
1276 if (!single_exit (loop))
1277 return opt_result::failure_at (vect_location,
1278 "not vectorized: multiple exits.\n");
1279 if (EDGE_COUNT (loop->header->preds) != 2)
1280 return opt_result::failure_at (vect_location,
1281 "not vectorized:"
1282 " too many incoming edges.\n");
1284 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1285 that the loop is represented as a do-while (with a proper if-guard
1286 before the loop if needed), where the loop header contains all the
1287 executable statements, and the latch is empty. */
1288 if (!empty_block_p (loop->latch)
1289 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1290 return opt_result::failure_at (vect_location,
1291 "not vectorized: latch block not empty.\n");
1293 /* Make sure the exit is not abnormal. */
1294 edge e = single_exit (loop);
1295 if (e->flags & EDGE_ABNORMAL)
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized:"
1298 " abnormal loop exit edge.\n");
1300 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1301 number_of_iterationsm1);
1302 if (!*loop_cond)
1303 return opt_result::failure_at
1304 (vect_location,
1305 "not vectorized: complicated exit condition.\n");
1307 if (integer_zerop (*assumptions)
1308 || !*number_of_iterations
1309 || chrec_contains_undetermined (*number_of_iterations))
1310 return opt_result::failure_at
1311 (*loop_cond,
1312 "not vectorized: number of iterations cannot be computed.\n");
1314 if (integer_zerop (*number_of_iterations))
1315 return opt_result::failure_at
1316 (*loop_cond,
1317 "not vectorized: number of iterations = 0.\n");
1319 return opt_result::success ();
1322 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1324 opt_loop_vec_info
1325 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1327 tree assumptions, number_of_iterations, number_of_iterationsm1;
1328 gcond *loop_cond, *inner_loop_cond = NULL;
1330 opt_result res
1331 = vect_analyze_loop_form_1 (loop, &loop_cond,
1332 &assumptions, &number_of_iterationsm1,
1333 &number_of_iterations, &inner_loop_cond);
1334 if (!res)
1335 return opt_loop_vec_info::propagate_failure (res);
1337 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1338 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1339 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1340 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1341 if (!integer_onep (assumptions))
1343 /* We consider to vectorize this loop by versioning it under
1344 some assumptions. In order to do this, we need to clear
1345 existing information computed by scev and niter analyzer. */
1346 scev_reset_htab ();
1347 free_numbers_of_iterations_estimates (loop);
1348 /* Also set flag for this loop so that following scev and niter
1349 analysis are done under the assumptions. */
1350 loop_constraint_set (loop, LOOP_C_FINITE);
1351 /* Also record the assumptions for versioning. */
1352 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1355 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1357 if (dump_enabled_p ())
1359 dump_printf_loc (MSG_NOTE, vect_location,
1360 "Symbolic number of iterations is ");
1361 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1362 dump_printf (MSG_NOTE, "\n");
1366 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1367 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1368 if (inner_loop_cond)
1370 stmt_vec_info inner_loop_cond_info
1371 = loop_vinfo->lookup_stmt (inner_loop_cond);
1372 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1375 gcc_assert (!loop->aux);
1376 loop->aux = loop_vinfo;
1377 return opt_loop_vec_info::success (loop_vinfo);
1382 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1383 statements, update the vectorization factor. */
1385 static void
1386 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1388 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1389 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1390 int nbbs = loop->num_nodes;
1391 poly_uint64 vectorization_factor;
1392 int i;
1394 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1396 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397 gcc_assert (known_ne (vectorization_factor, 0U));
1399 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1400 vectorization factor of the loop is the unrolling factor required by
1401 the SLP instances. If that unrolling factor is 1, we say that we
1402 perform pure SLP on the loop; cross-iteration parallelism is not
1403 exploited. */
1404 bool only_slp_in_loop = true;
1405 for (i = 0; i < nbbs; i++)
1407 basic_block bb = bbs[i];
1408 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1409 gsi_next (&si))
1411 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1412 stmt_info = vect_stmt_to_vectorize (stmt_info);
1413 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1414 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1415 && !PURE_SLP_STMT (stmt_info))
1416 /* STMT needs both SLP and loop-based vectorization. */
1417 only_slp_in_loop = false;
1421 if (only_slp_in_loop)
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_NOTE, vect_location,
1425 "Loop contains only SLP stmts\n");
1426 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1428 else
1430 if (dump_enabled_p ())
1431 dump_printf_loc (MSG_NOTE, vect_location,
1432 "Loop contains SLP and non-SLP stmts\n");
1433 /* Both the vectorization factor and unroll factor have the form
1434 current_vector_size * X for some rational X, so they must have
1435 a common multiple. */
1436 vectorization_factor
1437 = force_common_multiple (vectorization_factor,
1438 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1441 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1442 if (dump_enabled_p ())
1444 dump_printf_loc (MSG_NOTE, vect_location,
1445 "Updating vectorization factor to ");
1446 dump_dec (MSG_NOTE, vectorization_factor);
1447 dump_printf (MSG_NOTE, ".\n");
1451 /* Return true if STMT_INFO describes a double reduction phi and if
1452 the other phi in the reduction is also relevant for vectorization.
1453 This rejects cases such as:
1455 outer1:
1456 x_1 = PHI <x_3(outer2), ...>;
1459 inner:
1460 x_2 = ...;
1463 outer2:
1464 x_3 = PHI <x_2(inner)>;
1466 if nothing in x_2 or elsewhere makes x_1 relevant. */
1468 static bool
1469 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1471 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1472 return false;
1474 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1477 /* Function vect_analyze_loop_operations.
1479 Scan the loop stmts and make sure they are all vectorizable. */
1481 static opt_result
1482 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1484 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1485 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1486 int nbbs = loop->num_nodes;
1487 int i;
1488 stmt_vec_info stmt_info;
1489 bool need_to_vectorize = false;
1490 bool ok;
1492 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1494 auto_vec<stmt_info_for_cost> cost_vec;
1496 for (i = 0; i < nbbs; i++)
1498 basic_block bb = bbs[i];
1500 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1501 gsi_next (&si))
1503 gphi *phi = si.phi ();
1504 ok = true;
1506 stmt_info = loop_vinfo->lookup_stmt (phi);
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1509 if (virtual_operand_p (gimple_phi_result (phi)))
1510 continue;
1512 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1513 (i.e., a phi in the tail of the outer-loop). */
1514 if (! is_loop_header_bb_p (bb))
1516 /* FORNOW: we currently don't support the case in which these phis
1517 are not used in the outer loop (unless it is a double reduction,
1518 i.e., this phi is vect_reduction_def), because that case would
1519 require actually doing something here. */
1520 if (STMT_VINFO_LIVE_P (stmt_info)
1521 && !vect_active_double_reduction_p (stmt_info))
1522 return opt_result::failure_at (phi,
1523 "Unsupported loop-closed phi"
1524 " in outer-loop.\n");
1526 /* If PHI is used in the outer loop, we check that its operand
1527 is defined in the inner loop. */
1528 if (STMT_VINFO_RELEVANT_P (stmt_info))
1530 tree phi_op;
1532 if (gimple_phi_num_args (phi) != 1)
1533 return opt_result::failure_at (phi, "unsupported phi");
1535 phi_op = PHI_ARG_DEF (phi, 0);
1536 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1537 if (!op_def_info)
1538 return opt_result::failure_at (phi, "unsupported phi");
1540 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1541 && (STMT_VINFO_RELEVANT (op_def_info)
1542 != vect_used_in_outer_by_reduction))
1543 return opt_result::failure_at (phi, "unsupported phi");
1546 continue;
1549 gcc_assert (stmt_info);
1551 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1552 || STMT_VINFO_LIVE_P (stmt_info))
1553 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1554 /* A scalar-dependence cycle that we don't support. */
1555 return opt_result::failure_at (phi,
1556 "not vectorized:"
1557 " scalar dependence cycle.\n");
1559 if (STMT_VINFO_RELEVANT_P (stmt_info))
1561 need_to_vectorize = true;
1562 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1563 && ! PURE_SLP_STMT (stmt_info))
1564 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1565 &cost_vec);
1566 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1567 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1568 && ! PURE_SLP_STMT (stmt_info))
1569 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1570 &cost_vec);
1573 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1574 if (ok
1575 && STMT_VINFO_LIVE_P (stmt_info)
1576 && !PURE_SLP_STMT (stmt_info))
1577 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1578 &cost_vec);
1580 if (!ok)
1581 return opt_result::failure_at (phi,
1582 "not vectorized: relevant phi not "
1583 "supported: %G",
1584 static_cast <gimple *> (phi));
1587 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588 gsi_next (&si))
1590 gimple *stmt = gsi_stmt (si);
1591 if (!gimple_clobber_p (stmt))
1593 opt_result res
1594 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1595 &need_to_vectorize,
1596 NULL, NULL, &cost_vec);
1597 if (!res)
1598 return res;
1601 } /* bbs */
1603 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1605 /* All operations in the loop are either irrelevant (deal with loop
1606 control, or dead), or only used outside the loop and can be moved
1607 out of the loop (e.g. invariants, inductions). The loop can be
1608 optimized away by scalar optimizations. We're better off not
1609 touching this loop. */
1610 if (!need_to_vectorize)
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_NOTE, vect_location,
1614 "All the computation can be taken out of the loop.\n");
1615 return opt_result::failure_at
1616 (vect_location,
1617 "not vectorized: redundant loop. no profit to vectorize.\n");
1620 return opt_result::success ();
1623 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1624 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1625 definitely no, or -1 if it's worth retrying. */
1627 static int
1628 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1633 /* Only fully-masked loops can have iteration counts less than the
1634 vectorization factor. */
1635 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1637 HOST_WIDE_INT max_niter;
1639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1640 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1641 else
1642 max_niter = max_stmt_executions_int (loop);
1644 if (max_niter != -1
1645 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 "not vectorized: iteration count smaller than "
1650 "vectorization factor.\n");
1651 return 0;
1655 int min_profitable_iters, min_profitable_estimate;
1656 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1657 &min_profitable_estimate);
1659 if (min_profitable_iters < 0)
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vectorization not profitable.\n");
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "not vectorized: vector version will never be "
1667 "profitable.\n");
1668 return -1;
1671 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1672 * assumed_vf);
1674 /* Use the cost model only if it is more conservative than the user-specified
1675 threshold. */
1676 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1677 min_profitable_iters);
1679 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1681 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1682 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "not vectorized: vectorization not profitable.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "not vectorized: iteration count smaller than user "
1690 "specified loop bound parameter or minimum profitable "
1691 "iterations (whichever is more conservative).\n");
1692 return 0;
1695 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1696 if (estimated_niter == -1)
1697 estimated_niter = likely_max_stmt_executions_int (loop);
1698 if (estimated_niter != -1
1699 && ((unsigned HOST_WIDE_INT) estimated_niter
1700 < MAX (th, (unsigned) min_profitable_estimate)))
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "not vectorized: estimated iteration count too "
1705 "small.\n");
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "not vectorized: estimated iteration count smaller "
1709 "than specified loop bound parameter or minimum "
1710 "profitable iterations (whichever is more "
1711 "conservative).\n");
1712 return -1;
1715 return 1;
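/* Worked example (illustrative numbers): with --param min-vect-loop-bound=4
   and an assumed VF of 8, min_scalar_loop_bound is 32; if the cost model
   reports 20 profitable iterations, TH becomes MAX (32, 20) = 32, and a
   known iteration count below 32 makes this function return 0 above.  */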
1718 static opt_result
1719 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1720 vec<data_reference_p> *datarefs,
1721 unsigned int *n_stmts)
1723 *n_stmts = 0;
1724 for (unsigned i = 0; i < loop->num_nodes; i++)
1725 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1726 !gsi_end_p (gsi); gsi_next (&gsi))
1728 gimple *stmt = gsi_stmt (gsi);
1729 if (is_gimple_debug (stmt))
1730 continue;
1731 ++(*n_stmts);
1732 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1733 if (!res)
1735 if (is_gimple_call (stmt) && loop->safelen)
1737 tree fndecl = gimple_call_fndecl (stmt), op;
1738 if (fndecl != NULL_TREE)
1740 cgraph_node *node = cgraph_node::get (fndecl);
1741 if (node != NULL && node->simd_clones != NULL)
1743 unsigned int j, n = gimple_call_num_args (stmt);
1744 for (j = 0; j < n; j++)
1746 op = gimple_call_arg (stmt, j);
1747 if (DECL_P (op)
1748 || (REFERENCE_CLASS_P (op)
1749 && get_base_address (op)))
1750 break;
1752 op = gimple_call_lhs (stmt);
1753 /* Ignore #pragma omp declare simd functions
1754 if they don't have data references in the
1755 call stmt itself. */
1756 if (j == n
1757 && !(op
1758 && (DECL_P (op)
1759 || (REFERENCE_CLASS_P (op)
1760 && get_base_address (op)))))
1761 continue;
1765 return res;
1767 /* If dependence analysis will give up due to the limit on the
1768 number of datarefs, stop here and fail fatally. */
1769 if (datarefs->length ()
1770 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1771 return opt_result::failure_at (stmt, "exceeded param "
1772 "loop-max-datarefs-for-datadeps\n");
1774 return opt_result::success ();
1777 /* Look for SLP-only access groups and turn each individual access into its own
1778 group. */
1779 static void
1780 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1782 unsigned int i;
1783 struct data_reference *dr;
1785 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1787 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1788 FOR_EACH_VEC_ELT (datarefs, i, dr)
1790 gcc_assert (DR_REF (dr));
1791 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1793 /* Check if the load is a part of an interleaving chain. */
1794 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1796 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1797 unsigned int group_size = DR_GROUP_SIZE (first_element);
1799 /* Check if SLP-only groups. */
1800 if (!STMT_SLP_TYPE (stmt_info)
1801 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1803 /* Dissolve the group. */
1804 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1806 stmt_vec_info vinfo = first_element;
1807 while (vinfo)
1809 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1810 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1811 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1812 DR_GROUP_SIZE (vinfo) = 1;
1813 DR_GROUP_GAP (vinfo) = group_size - 1;
1814 vinfo = next;
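/* Illustrative example: dissolving a group of four accesses yields four
   single-element groups, each with DR_GROUP_SIZE 1 and DR_GROUP_GAP 3,
   so the distance to the next member of the original group is still
   accounted for as a gap when each access is vectorized on its own.  */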
1821 /* Function vect_analyze_loop_2.
1823 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1824 for it. The different analyses will record information in the
1825 loop_vec_info struct. */
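/* Rough outline of the analysis order below (illustrative summary only; the
   code is authoritative): gather data references
   (vect_get_datarefs_in_loop), analyze them and their access patterns,
   classify scalar cycles and recognize patterns, determine the
   vectorization factor, analyze dependences, alignment, SLP opportunities
   and costs, and on a recoverable SLP-related failure roll back at the
   "again" label and restart from "start_over" with SLP disabled. */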
1826 static opt_result
1827 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1829 opt_result ok = opt_result::success ();
1830 int res;
1831 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1832 poly_uint64 min_vf = 2;
1834 /* The first group of checks is independent of the vector size. */
1835 fatal = true;
1837 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1838 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1839 return opt_result::failure_at (vect_location,
1840 "not vectorized: simd if(0)\n");
1842 /* Find all data references in the loop (which correspond to vdefs/vuses)
1843 and analyze their evolution in the loop. */
1845 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1847 /* Gather the data references and count stmts in the loop. */
1848 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1850 opt_result res
1851 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1852 &LOOP_VINFO_DATAREFS (loop_vinfo),
1853 n_stmts);
1854 if (!res)
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 "not vectorized: loop contains function "
1859 "calls or data references that cannot "
1860 "be analyzed\n");
1861 return res;
1863 loop_vinfo->shared->save_datarefs ();
1865 else
1866 loop_vinfo->shared->check_datarefs ();
1868 /* Analyze the data references and also adjust the minimal
1869 vectorization factor according to the loads and stores. */
1871 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1872 if (!ok)
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 "bad data references.\n");
1877 return ok;
1880 /* Classify all cross-iteration scalar data-flow cycles.
1881 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1882 vect_analyze_scalar_cycles (loop_vinfo);
1884 vect_pattern_recog (loop_vinfo);
1886 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1888 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1889 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1891 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1892 if (!ok)
1894 if (dump_enabled_p ())
1895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1896 "bad data access.\n");
1897 return ok;
1900 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1902 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1903 if (!ok)
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1907 "unexpected pattern.\n");
1908 return ok;
1911 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer fatal. */
1912 fatal = false;
1914 /* Analyze data dependences between the data-refs in the loop
1915 and adjust the maximum vectorization factor according to
1916 the dependences.
1917 FORNOW: fail at the first data dependence that we encounter. */
1919 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1920 if (!ok)
1922 if (dump_enabled_p ())
1923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924 "bad data dependence.\n");
1925 return ok;
1927 if (max_vf != MAX_VECTORIZATION_FACTOR
1928 && maybe_lt (max_vf, min_vf))
1929 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1930 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1932 ok = vect_determine_vectorization_factor (loop_vinfo);
1933 if (!ok)
1935 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "can't determine vectorization factor.\n");
1938 return ok;
1940 if (max_vf != MAX_VECTORIZATION_FACTOR
1941 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1942 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1944 /* Compute the scalar iteration cost. */
1945 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1947 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 unsigned th;
1950 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1951 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1952 if (!ok)
1953 return ok;
1955 /* If there are any SLP instances mark them as pure_slp. */
1956 bool slp = vect_make_slp_decision (loop_vinfo);
1957 if (slp)
1959 /* Find stmts that need to be both vectorized and SLPed. */
1960 vect_detect_hybrid_slp (loop_vinfo);
1962 /* Update the vectorization factor based on the SLP decision. */
1963 vect_update_vf_for_slp (loop_vinfo);
1966 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1968 /* We don't expect to have to roll back to anything other than an empty
1969 set of rgroups. */
1970 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1972 /* This is the point where we can re-start analysis with SLP forced off. */
1973 start_over:
1975 /* Now the vectorization factor is final. */
1976 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977 gcc_assert (known_ne (vectorization_factor, 0U));
1979 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "vectorization_factor = ");
1983 dump_dec (MSG_NOTE, vectorization_factor);
1984 dump_printf (MSG_NOTE, ", niters = %wd\n",
1985 LOOP_VINFO_INT_NITERS (loop_vinfo));
1988 HOST_WIDE_INT max_niter
1989 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1991 /* Analyze the alignment of the data-refs in the loop.
1992 Fail if a data reference is found that cannot be vectorized. */
1994 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1995 if (!ok)
1997 if (dump_enabled_p ())
1998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1999 "bad data alignment.\n");
2000 return ok;
2003 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2004 It is important to call pruning after vect_analyze_data_ref_accesses,
2005 since we use grouping information gathered by interleaving analysis. */
2006 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2007 if (!ok)
2008 return ok;
2010 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2011 vectorization, since we do not want to add extra peeling or
2012 add versioning for alignment. */
2013 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2014 /* This pass will decide on using loop versioning and/or loop peeling in
2015 order to enhance the alignment of data references in the loop. */
2016 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2017 else
2018 ok = vect_verify_datarefs_alignment (loop_vinfo);
2019 if (!ok)
2020 return ok;
2022 if (slp)
2024 /* Analyze operations in the SLP instances. Note this may
2025 remove unsupported SLP instances which makes the above
2026 SLP kind detection invalid. */
2027 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2028 vect_slp_analyze_operations (loop_vinfo);
2029 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2031 ok = opt_result::failure_at (vect_location,
2032 "unsupported SLP instances\n");
2033 goto again;
2037 /* Dissolve SLP-only groups. */
2038 vect_dissolve_slp_only_groups (loop_vinfo);
2040 /* Scan all the remaining operations in the loop that are not subject
2041 to SLP and make sure they are vectorizable. */
2042 ok = vect_analyze_loop_operations (loop_vinfo);
2043 if (!ok)
2045 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047 "bad operation or unsupported loop bound.\n");
2048 return ok;
2051 /* Decide whether to use a fully-masked loop for this vectorization
2052 factor. */
2053 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2054 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2055 && vect_verify_full_masking (loop_vinfo));
2056 if (dump_enabled_p ())
2058 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "using a fully-masked loop.\n");
2061 else
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "not using a fully-masked loop.\n");
2066 /* If epilog loop is required because of data accesses with gaps,
2067 one additional iteration needs to be peeled. Check if there are
2068 enough iterations for vectorization. */
2069 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2070 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2071 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2073 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2074 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2076 if (known_lt (wi::to_widest (scalar_niters), vf))
2077 return opt_result::failure_at (vect_location,
2078 "loop has no enough iterations to"
2079 " support peeling for gaps.\n");
2082 /* Check the costings of the loop make vectorizing worthwhile. */
2083 res = vect_analyze_loop_costing (loop_vinfo);
2084 if (res < 0)
2086 ok = opt_result::failure_at (vect_location,
2087 "Loop costings may not be worthwhile.\n");
2088 goto again;
2090 if (!res)
2091 return opt_result::failure_at (vect_location,
2092 "Loop costings not worthwhile.\n");
2094 /* Decide whether we need to create an epilogue loop to handle
2095 remaining scalar iterations. */
2096 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2098 unsigned HOST_WIDE_INT const_vf;
2099 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2100 /* The main loop handles all iterations. */
2101 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2102 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2103 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2105 /* Work out the (constant) number of iterations that need to be
2106 peeled for reasons other than niters. */
2107 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2108 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2109 peel_niter += 1;
2110 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2111 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2112 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2114 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2115 /* ??? When peeling for gaps but not alignment, we could
2116 try to check whether the (variable) niters is known to be
2117 VF * N + 1. That's something of a niche case though. */
2118 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2119 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2120 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2121 < (unsigned) exact_log2 (const_vf))
2122 /* In case of versioning, check if the maximum number of
2123 iterations is greater than th. If they are identical,
2124 the epilogue is unnecessary. */
2125 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2126 || ((unsigned HOST_WIDE_INT) max_niter
2127 > (th / const_vf) * const_vf))))
2128 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2130 /* If an epilogue loop is required make sure we can create one. */
2131 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2132 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2136 if (!vect_can_advance_ivs_p (loop_vinfo)
2137 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2138 single_exit (LOOP_VINFO_LOOP
2139 (loop_vinfo))))
2141 ok = opt_result::failure_at (vect_location,
2142 "not vectorized: can't create required "
2143 "epilog loop\n");
2144 goto again;
2148 /* During peeling, we need to check if number of loop iterations is
2149 enough for both peeled prolog loop and vector loop. This check
2150 can be merged along with threshold check of loop versioning, so
2151 increase threshold for this case if necessary. */
2152 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2154 poly_uint64 niters_th = 0;
2156 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2158 /* Niters for peeled prolog loop. */
2159 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2161 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2162 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2163 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2165 else
2166 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2169 /* Niters for at least one iteration of vectorized loop. */
2170 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2171 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2172 /* One additional iteration because of peeling for gap. */
2173 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2174 niters_th += 1;
2175 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2178 gcc_assert (known_eq (vectorization_factor,
2179 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2181 /* Ok to vectorize! */
2182 return opt_result::success ();
2184 again:
2185 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2186 gcc_assert (!ok);
2188 /* Try again with SLP forced off but if we didn't do any SLP there is
2189 no point in re-trying. */
2190 if (!slp)
2191 return ok;
2193 /* If there are reduction chains re-trying will fail anyway. */
2194 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2195 return ok;
2197 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2198 via interleaving or lane instructions. */
2199 slp_instance instance;
2200 slp_tree node;
2201 unsigned i, j;
2202 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2204 stmt_vec_info vinfo;
2205 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2206 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2207 continue;
2208 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2209 unsigned int size = DR_GROUP_SIZE (vinfo);
2210 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2211 if (! vect_store_lanes_supported (vectype, size, false)
2212 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2213 && ! vect_grouped_store_supported (vectype, size))
2214 return opt_result::failure_at (vinfo->stmt,
2215 "unsupported grouped store\n");
2216 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2218 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2219 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2220 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2221 size = DR_GROUP_SIZE (vinfo);
2222 vectype = STMT_VINFO_VECTYPE (vinfo);
2223 if (! vect_load_lanes_supported (vectype, size, false)
2224 && ! vect_grouped_load_supported (vectype, single_element_p,
2225 size))
2226 return opt_result::failure_at (vinfo->stmt,
2227 "unsupported grouped load\n");
2231 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_NOTE, vect_location,
2233 "re-trying with SLP disabled\n");
2235 /* Roll back state appropriately. No SLP this time. */
2236 slp = false;
2237 /* Restore the vectorization factor as it was without SLP. */
2238 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2239 /* Free the SLP instances. */
2240 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2241 vect_free_slp_instance (instance, false);
2242 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2243 /* Reset SLP type to loop_vect on all stmts. */
2244 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2246 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2247 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2248 !gsi_end_p (si); gsi_next (&si))
2250 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2251 STMT_SLP_TYPE (stmt_info) = loop_vect;
2253 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2254 !gsi_end_p (si); gsi_next (&si))
2256 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2257 STMT_SLP_TYPE (stmt_info) = loop_vect;
2258 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2260 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2261 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2262 STMT_SLP_TYPE (stmt_info) = loop_vect;
2263 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2264 !gsi_end_p (pi); gsi_next (&pi))
2265 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2266 = loop_vect;
2270 /* Free optimized alias test DDRS. */
2271 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2272 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2273 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2274 /* Reset target cost data. */
2275 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2276 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2277 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2278 /* Reset accumulated rgroup information. */
2279 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2280 /* Reset assorted flags. */
2281 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2282 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2283 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2284 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2285 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2287 goto start_over;
2290 /* Function vect_analyze_loop.
2292 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2293 for it. The different analyses will record information in the
2294 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2295 be vectorized. */
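/* Sketch of the retry strategy used below (illustrative only): the analysis
   is attempted with each vector size reported by
   targetm.vectorize.autovectorize_vector_sizes until one succeeds or a
   fatal failure is seen; when the loop has a simdlen clause, a successful
   analysis whose vectorization factor does not match simdlen is kept as a
   fallback in first_loop_vinfo while the remaining sizes are tried. */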
2296 opt_loop_vec_info
2297 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2298 vec_info_shared *shared)
2300 auto_vector_sizes vector_sizes;
2302 /* Autodetect first vector size we try. */
2303 current_vector_size = 0;
2304 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2305 loop->simdlen != 0);
2306 unsigned int next_size = 0;
2308 DUMP_VECT_SCOPE ("analyze_loop_nest");
2310 if (loop_outer (loop)
2311 && loop_vec_info_for_loop (loop_outer (loop))
2312 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2313 return opt_loop_vec_info::failure_at (vect_location,
2314 "outer-loop already vectorized.\n");
2316 if (!find_loop_nest (loop, &shared->loop_nest))
2317 return opt_loop_vec_info::failure_at
2318 (vect_location,
2319 "not vectorized: loop nest containing two or more consecutive inner"
2320 " loops cannot be vectorized\n");
2322 unsigned n_stmts = 0;
2323 poly_uint64 autodetected_vector_size = 0;
2324 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2325 poly_uint64 first_vector_size = 0;
2326 while (1)
2328 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2329 opt_loop_vec_info loop_vinfo
2330 = vect_analyze_loop_form (loop, shared);
2331 if (!loop_vinfo)
2333 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2335 "bad loop form.\n");
2336 gcc_checking_assert (first_loop_vinfo == NULL);
2337 return loop_vinfo;
2340 bool fatal = false;
2342 if (orig_loop_vinfo)
2343 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2345 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2346 if (res)
2348 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2350 if (loop->simdlen
2351 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2352 (unsigned HOST_WIDE_INT) loop->simdlen))
2354 if (first_loop_vinfo == NULL)
2356 first_loop_vinfo = loop_vinfo;
2357 first_vector_size = current_vector_size;
2358 loop->aux = NULL;
2360 else
2361 delete loop_vinfo;
2363 else
2365 delete first_loop_vinfo;
2366 return loop_vinfo;
2369 else
2370 delete loop_vinfo;
2372 if (next_size == 0)
2373 autodetected_vector_size = current_vector_size;
2375 if (next_size < vector_sizes.length ()
2376 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2377 next_size += 1;
2379 if (fatal)
2381 gcc_checking_assert (first_loop_vinfo == NULL);
2382 return opt_loop_vec_info::propagate_failure (res);
2385 if (next_size == vector_sizes.length ()
2386 || known_eq (current_vector_size, 0U))
2388 if (first_loop_vinfo)
2390 current_vector_size = first_vector_size;
2391 loop->aux = (loop_vec_info) first_loop_vinfo;
2392 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_NOTE, vect_location,
2395 "***** Choosing vector size ");
2396 dump_dec (MSG_NOTE, current_vector_size);
2397 dump_printf (MSG_NOTE, "\n");
2399 return first_loop_vinfo;
2401 else
2402 return opt_loop_vec_info::propagate_failure (res);
2405 /* Try the next biggest vector size. */
2406 current_vector_size = vector_sizes[next_size++];
2407 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_NOTE, vect_location,
2410 "***** Re-trying analysis with "
2411 "vector size ");
2412 dump_dec (MSG_NOTE, current_vector_size);
2413 dump_printf (MSG_NOTE, "\n");
2418 /* Return true if there is an in-order reduction function for CODE, storing
2419 it in *REDUC_FN if so. */
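/* An in-order (fold-left) reduction preserves the scalar evaluation order,
   e.g. res = (((res + v[0]) + v[1]) + v[2]) + v[3] for a single vector of
   four elements instead of summing independent partial results; for
   PLUS_EXPR this is provided by IFN_FOLD_LEFT_PLUS (illustration only). */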
2421 static bool
2422 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2424 switch (code)
2426 case PLUS_EXPR:
2427 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2428 return true;
2430 default:
2431 return false;
2435 /* Function reduction_fn_for_scalar_code
2437 Input:
2438 CODE - tree_code of a reduction operations.
2440 Output:
2441 REDUC_FN - the corresponding internal function to be used to reduce the
2442 vector of partial results into a single scalar result, or IFN_LAST
2443 if the operation is a supported reduction operation, but does not have
2444 such an internal function.
2446 Return FALSE if CODE currently cannot be vectorized as reduction. */
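/* For example (illustrative): a PLUS_EXPR reduction accumulates one partial
   sum per vector lane and uses IFN_REDUC_PLUS at the end to collapse the
   vector of partial sums into a single scalar; MULT_EXPR and MINUS_EXPR are
   accepted below but have no such internal function, hence IFN_LAST. */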
2448 static bool
2449 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2451 switch (code)
2453 case MAX_EXPR:
2454 *reduc_fn = IFN_REDUC_MAX;
2455 return true;
2457 case MIN_EXPR:
2458 *reduc_fn = IFN_REDUC_MIN;
2459 return true;
2461 case PLUS_EXPR:
2462 *reduc_fn = IFN_REDUC_PLUS;
2463 return true;
2465 case BIT_AND_EXPR:
2466 *reduc_fn = IFN_REDUC_AND;
2467 return true;
2469 case BIT_IOR_EXPR:
2470 *reduc_fn = IFN_REDUC_IOR;
2471 return true;
2473 case BIT_XOR_EXPR:
2474 *reduc_fn = IFN_REDUC_XOR;
2475 return true;
2477 case MULT_EXPR:
2478 case MINUS_EXPR:
2479 *reduc_fn = IFN_LAST;
2480 return true;
2482 default:
2483 return false;
2487 /* If there is a neutral value X such that SLP reduction NODE would not
2488 be affected by the introduction of additional X elements, return that X,
2489 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2490 is true if the SLP statements perform a single reduction, false if each
2491 statement performs an independent reduction. */
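/* For instance (illustrative): padding a sum with extra 0 elements, a
   product with extra 1 elements, or a bitwise AND with all-ones elements
   leaves the result unchanged, which is why those constants are returned
   below; MIN and MAX have no such constant in general, but for a reduction
   chain the single initial value itself is neutral. */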
2493 static tree
2494 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2495 bool reduc_chain)
2497 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2498 stmt_vec_info stmt_vinfo = stmts[0];
2499 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2500 tree scalar_type = TREE_TYPE (vector_type);
2501 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2502 gcc_assert (loop);
2504 switch (code)
2506 case WIDEN_SUM_EXPR:
2507 case DOT_PROD_EXPR:
2508 case SAD_EXPR:
2509 case PLUS_EXPR:
2510 case MINUS_EXPR:
2511 case BIT_IOR_EXPR:
2512 case BIT_XOR_EXPR:
2513 return build_zero_cst (scalar_type);
2515 case MULT_EXPR:
2516 return build_one_cst (scalar_type);
2518 case BIT_AND_EXPR:
2519 return build_all_ones_cst (scalar_type);
2521 case MAX_EXPR:
2522 case MIN_EXPR:
2523 /* For MIN/MAX the initial values are neutral. A reduction chain
2524 has only a single initial value, so that value is neutral for
2525 all statements. */
2526 if (reduc_chain)
2527 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2528 loop_preheader_edge (loop));
2529 return NULL_TREE;
2531 default:
2532 return NULL_TREE;
2536 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2537 STMT is printed with a message MSG. */
2539 static void
2540 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2542 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2545 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2546 operation. Return true if the results of DEF_STMT_INFO are something
2547 that can be accumulated by such a reduction. */
2549 static bool
2550 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2552 return (is_gimple_assign (def_stmt_info->stmt)
2553 || is_gimple_call (def_stmt_info->stmt)
2554 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2555 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2556 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2557 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2560 /* Detect SLP reduction of the form:
2562 #a1 = phi <a5, a0>
2563 a2 = operation (a1)
2564 a3 = operation (a2)
2565 a4 = operation (a3)
2566 a5 = operation (a4)
2568 #a = phi <a5>
2570 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2571 FIRST_STMT is the first reduction stmt in the chain
2572 (a2 = operation (a1)).
2574 Return TRUE if a reduction chain was detected. */
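/* A typical source-level form of such a chain (illustration only):

     for (i = 0; i < n; i++)
       {
         s += a[4*i];
         s += a[4*i + 1];
         s += a[4*i + 2];
         s += a[4*i + 3];
       }

   where each statement feeds the next through the same operation. */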
2576 static bool
2577 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2578 gimple *first_stmt)
2580 struct loop *loop = (gimple_bb (phi))->loop_father;
2581 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2582 enum tree_code code;
2583 gimple *loop_use_stmt = NULL;
2584 stmt_vec_info use_stmt_info;
2585 tree lhs;
2586 imm_use_iterator imm_iter;
2587 use_operand_p use_p;
2588 int nloop_uses, size = 0, n_out_of_loop_uses;
2589 bool found = false;
2591 if (loop != vect_loop)
2592 return false;
2594 auto_vec<stmt_vec_info, 8> reduc_chain;
2595 lhs = PHI_RESULT (phi);
2596 code = gimple_assign_rhs_code (first_stmt);
2597 while (1)
2599 nloop_uses = 0;
2600 n_out_of_loop_uses = 0;
2601 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2603 gimple *use_stmt = USE_STMT (use_p);
2604 if (is_gimple_debug (use_stmt))
2605 continue;
2607 /* Check if we got back to the reduction phi. */
2608 if (use_stmt == phi)
2610 loop_use_stmt = use_stmt;
2611 found = true;
2612 break;
2615 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2617 loop_use_stmt = use_stmt;
2618 nloop_uses++;
2620 else
2621 n_out_of_loop_uses++;
2623 /* There can be either a single use in the loop or two uses in
2624 phi nodes. */
2625 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2626 return false;
2629 if (found)
2630 break;
2632 /* We reached a statement with no loop uses. */
2633 if (nloop_uses == 0)
2634 return false;
2636 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2637 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2638 return false;
2640 if (!is_gimple_assign (loop_use_stmt)
2641 || code != gimple_assign_rhs_code (loop_use_stmt)
2642 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2643 return false;
2645 /* Insert USE_STMT into reduction chain. */
2646 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2647 reduc_chain.safe_push (use_stmt_info);
2649 lhs = gimple_assign_lhs (loop_use_stmt);
2650 size++;
2653 if (!found || loop_use_stmt != phi || size < 2)
2654 return false;
2656 /* Swap the operands, if needed, to make the reduction operand be the second
2657 operand. */
2658 lhs = PHI_RESULT (phi);
2659 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2661 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2662 if (gimple_assign_rhs2 (next_stmt) == lhs)
2664 tree op = gimple_assign_rhs1 (next_stmt);
2665 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2667 /* Check that the other def is either defined in the loop
2668 ("vect_internal_def"), or it's an induction (defined by a
2669 loop-header phi-node). */
2670 if (def_stmt_info
2671 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2672 && vect_valid_reduction_input_p (def_stmt_info))
2674 lhs = gimple_assign_lhs (next_stmt);
2675 continue;
2678 return false;
2680 else
2682 tree op = gimple_assign_rhs2 (next_stmt);
2683 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2685 /* Check that the other def is either defined in the loop
2686 ("vect_internal_def"), or it's an induction (defined by a
2687 loop-header phi-node). */
2688 if (def_stmt_info
2689 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2690 && vect_valid_reduction_input_p (def_stmt_info))
2692 if (dump_enabled_p ())
2693 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2694 next_stmt);
2696 swap_ssa_operands (next_stmt,
2697 gimple_assign_rhs1_ptr (next_stmt),
2698 gimple_assign_rhs2_ptr (next_stmt));
2699 update_stmt (next_stmt);
2701 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2702 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2704 else
2705 return false;
2708 lhs = gimple_assign_lhs (next_stmt);
2711 /* Build up the actual chain. */
2712 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2714 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2715 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2717 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2718 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2720 /* Save the chain for further analysis in SLP detection. */
2721 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2722 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2724 return true;
2727 /* Return true if we need an in-order reduction for operation CODE
2728 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2729 overflow must wrap. */
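/* For example (illustrative): a float summation without -fassociative-math
   must be computed in order because (a + b) + c need not equal a + (b + c)
   in floating point, whereas MIN and MAX are insensitive to reassociation;
   for integer types the concern is trapping overflow, or non-wrapping
   overflow when NEED_WRAPPING_INTEGRAL_OVERFLOW is set. */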
2731 static bool
2732 needs_fold_left_reduction_p (tree type, tree_code code,
2733 bool need_wrapping_integral_overflow)
2735 /* CHECKME: check for !flag_finite_math_only too? */
2736 if (SCALAR_FLOAT_TYPE_P (type))
2737 switch (code)
2739 case MIN_EXPR:
2740 case MAX_EXPR:
2741 return false;
2743 default:
2744 return !flag_associative_math;
2747 if (INTEGRAL_TYPE_P (type))
2749 if (!operation_no_trapping_overflow (type, code))
2750 return true;
2751 if (need_wrapping_integral_overflow
2752 && !TYPE_OVERFLOW_WRAPS (type)
2753 && operation_can_overflow (code))
2754 return true;
2755 return false;
2758 if (SAT_FIXED_POINT_TYPE_P (type))
2759 return true;
2761 return false;
2764 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2765 reduction operation CODE has a handled computation expression. */
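/* Illustrative example of a handled path for CODE == PLUS_EXPR:

     x_1 = PHI <x_4(latch), x_0(preheader)>
     x_2 = x_1 + a_5;
     x_3 = x_2 + b_6;
     x_4 = x_3 + c_7;

   The walk below starts from LOOP_ARG (x_4 here), follows defining
   statements back to the PHI result and checks that each statement on the
   path has a single use and uses CODE (a MINUS_EXPR is tolerated for
   PLUS_EXPR as long as the running value is not negated overall). */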
2767 bool
2768 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2769 tree loop_arg, enum tree_code code)
2771 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2772 auto_bitmap visited;
2773 tree lookfor = PHI_RESULT (phi);
2774 ssa_op_iter curri;
2775 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2776 while (USE_FROM_PTR (curr) != loop_arg)
2777 curr = op_iter_next_use (&curri);
2778 curri.i = curri.numops;
2781 path.safe_push (std::make_pair (curri, curr));
2782 tree use = USE_FROM_PTR (curr);
2783 if (use == lookfor)
2784 break;
2785 gimple *def = SSA_NAME_DEF_STMT (use);
2786 if (gimple_nop_p (def)
2787 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2789 pop:
2792 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2793 curri = x.first;
2794 curr = x.second;
2796 curr = op_iter_next_use (&curri);
2797 /* Skip already visited or non-SSA operands (from iterating
2798 over PHI args). */
2799 while (curr != NULL_USE_OPERAND_P
2800 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2801 || ! bitmap_set_bit (visited,
2802 SSA_NAME_VERSION
2803 (USE_FROM_PTR (curr)))));
2805 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2806 if (curr == NULL_USE_OPERAND_P)
2807 break;
2809 else
2811 if (gimple_code (def) == GIMPLE_PHI)
2812 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2813 else
2814 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2815 while (curr != NULL_USE_OPERAND_P
2816 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2817 || ! bitmap_set_bit (visited,
2818 SSA_NAME_VERSION
2819 (USE_FROM_PTR (curr)))))
2820 curr = op_iter_next_use (&curri);
2821 if (curr == NULL_USE_OPERAND_P)
2822 goto pop;
2825 while (1);
2826 if (dump_file && (dump_flags & TDF_DETAILS))
2828 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2829 unsigned i;
2830 std::pair<ssa_op_iter, use_operand_p> *x;
2831 FOR_EACH_VEC_ELT (path, i, x)
2832 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2833 dump_printf (MSG_NOTE, "\n");
2836 /* Check whether the reduction path detected is valid. */
2837 bool fail = path.length () == 0;
2838 bool neg = false;
2839 for (unsigned i = 1; i < path.length (); ++i)
2841 gimple *use_stmt = USE_STMT (path[i].second);
2842 tree op = USE_FROM_PTR (path[i].second);
2843 if (! has_single_use (op)
2844 || ! is_gimple_assign (use_stmt))
2846 fail = true;
2847 break;
2849 if (gimple_assign_rhs_code (use_stmt) != code)
2851 if (code == PLUS_EXPR
2852 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2854 /* Track whether we negate the reduction value each iteration. */
2855 if (gimple_assign_rhs2 (use_stmt) == op)
2856 neg = ! neg;
2858 else
2860 fail = true;
2861 break;
2865 return ! fail && ! neg;
2869 /* Function vect_is_simple_reduction
2871 (1) Detect a cross-iteration def-use cycle that represents a simple
2872 reduction computation. We look for the following pattern:
2874 loop_header:
2875 a1 = phi < a0, a2 >
2876 a3 = ...
2877 a2 = operation (a3, a1)
2881 a3 = ...
2882 loop_header:
2883 a1 = phi < a0, a2 >
2884 a2 = operation (a3, a1)
2886 such that:
2887 1. operation is commutative and associative and it is safe to
2888 change the order of the computation
2889 2. no uses for a2 in the loop (a2 is used out of the loop)
2890 3. no uses of a1 in the loop besides the reduction operation
2891 4. no uses of a1 outside the loop.
2893 Conditions 1,4 are tested here.
2894 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2896 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2897 nested cycles.
2899 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2900 reductions:
2902 a1 = phi < a0, a2 >
2903 inner loop (def of a3)
2904 a2 = phi < a3 >
2906 (4) Detect condition expressions, ie:
2907 for (int i = 0; i < N; i++)
2908 if (a[i] < val)
2909 ret_val = a[i];
2913 static stmt_vec_info
2914 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2915 bool *double_reduc,
2916 bool need_wrapping_integral_overflow,
2917 enum vect_reduction_type *v_reduc_type)
2919 gphi *phi = as_a <gphi *> (phi_info->stmt);
2920 struct loop *loop = (gimple_bb (phi))->loop_father;
2921 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2922 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2923 gimple *phi_use_stmt = NULL;
2924 enum tree_code orig_code, code;
2925 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2926 tree type;
2927 tree name;
2928 imm_use_iterator imm_iter;
2929 use_operand_p use_p;
2930 bool phi_def;
2932 *double_reduc = false;
2933 *v_reduc_type = TREE_CODE_REDUCTION;
2935 tree phi_name = PHI_RESULT (phi);
2936 /* ??? If there are no uses of the PHI result the inner loop reduction
2937 won't be detected as possibly double-reduction by vectorizable_reduction
2938 because that tries to walk the PHI arg from the preheader edge which
2939 can be constant. See PR60382. */
2940 if (has_zero_uses (phi_name))
2941 return NULL;
2942 unsigned nphi_def_loop_uses = 0;
2943 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2945 gimple *use_stmt = USE_STMT (use_p);
2946 if (is_gimple_debug (use_stmt))
2947 continue;
2949 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2951 if (dump_enabled_p ())
2952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2953 "intermediate value used outside loop.\n");
2955 return NULL;
2958 nphi_def_loop_uses++;
2959 phi_use_stmt = use_stmt;
2962 edge latch_e = loop_latch_edge (loop);
2963 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2964 if (TREE_CODE (loop_arg) != SSA_NAME)
2966 if (dump_enabled_p ())
2967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2968 "reduction: not ssa_name: %T\n", loop_arg);
2969 return NULL;
2972 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2973 if (!def_stmt_info
2974 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2975 return NULL;
2977 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2979 name = gimple_assign_lhs (def_stmt);
2980 phi_def = false;
2982 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2984 name = PHI_RESULT (def_stmt);
2985 phi_def = true;
2987 else
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2991 "reduction: unhandled reduction operation: %G",
2992 def_stmt_info->stmt);
2993 return NULL;
2996 unsigned nlatch_def_loop_uses = 0;
2997 auto_vec<gphi *, 3> lcphis;
2998 bool inner_loop_of_double_reduc = false;
2999 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3001 gimple *use_stmt = USE_STMT (use_p);
3002 if (is_gimple_debug (use_stmt))
3003 continue;
3004 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3005 nlatch_def_loop_uses++;
3006 else
3008 /* We can have more than one loop-closed PHI. */
3009 lcphis.safe_push (as_a <gphi *> (use_stmt));
3010 if (nested_in_vect_loop
3011 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3012 == vect_double_reduction_def))
3013 inner_loop_of_double_reduc = true;
3017 /* If this isn't a nested cycle or if the nested cycle reduction value
3018 is used outside of the inner loop we cannot handle uses of the reduction
3019 value. */
3020 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3021 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
3023 if (dump_enabled_p ())
3024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3025 "reduction used in loop.\n");
3026 return NULL;
3029 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3030 defined in the inner loop. */
3031 if (phi_def)
3033 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
3034 op1 = PHI_ARG_DEF (def_stmt, 0);
3036 if (gimple_phi_num_args (def_stmt) != 1
3037 || TREE_CODE (op1) != SSA_NAME)
3039 if (dump_enabled_p ())
3040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3041 "unsupported phi node definition.\n");
3043 return NULL;
3046 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3047 if (gimple_bb (def1)
3048 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3049 && loop->inner
3050 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3051 && is_gimple_assign (def1)
3052 && is_a <gphi *> (phi_use_stmt)
3053 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3055 if (dump_enabled_p ())
3056 report_vect_op (MSG_NOTE, def_stmt,
3057 "detected double reduction: ");
3059 *double_reduc = true;
3060 return def_stmt_info;
3063 return NULL;
3066 /* If we are vectorizing an inner reduction we are executing that
3067 in the original order only in case we are not dealing with a
3068 double reduction. */
3069 bool check_reduction = true;
3070 if (flow_loop_nested_p (vect_loop, loop))
3072 gphi *lcphi;
3073 unsigned i;
3074 check_reduction = false;
3075 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3076 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3078 gimple *use_stmt = USE_STMT (use_p);
3079 if (is_gimple_debug (use_stmt))
3080 continue;
3081 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3082 check_reduction = true;
3086 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3087 code = orig_code = gimple_assign_rhs_code (def_stmt);
3089 if (nested_in_vect_loop && !check_reduction)
3091 /* FIXME: Even for non-reductions code generation is funneled
3092 through vectorizable_reduction for the stmt defining the
3093 PHI latch value. So we have to artificially restrict ourselves
3094 to the supported operations. */
3095 switch (get_gimple_rhs_class (code))
3097 case GIMPLE_BINARY_RHS:
3098 case GIMPLE_TERNARY_RHS:
3099 break;
3100 default:
3101 /* Not supported by vectorizable_reduction. */
3102 if (dump_enabled_p ())
3103 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3104 "nested cycle: not handled operation: ");
3105 return NULL;
3107 if (dump_enabled_p ())
3108 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3109 return def_stmt_info;
3112 /* We can handle "res -= x[i]", which is non-associative by
3113 simply rewriting this into "res += -x[i]". Avoid changing
3114 gimple instruction for the first simple tests and only do this
3115 if we're allowed to change code at all. */
3116 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3117 code = PLUS_EXPR;
3119 if (code == COND_EXPR)
3121 if (! nested_in_vect_loop)
3122 *v_reduc_type = COND_REDUCTION;
3124 op3 = gimple_assign_rhs1 (def_stmt);
3125 if (COMPARISON_CLASS_P (op3))
3127 op4 = TREE_OPERAND (op3, 1);
3128 op3 = TREE_OPERAND (op3, 0);
3130 if (op3 == phi_name || op4 == phi_name)
3132 if (dump_enabled_p ())
3133 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3134 "reduction: condition depends on previous"
3135 " iteration: ");
3136 return NULL;
3139 op1 = gimple_assign_rhs2 (def_stmt);
3140 op2 = gimple_assign_rhs3 (def_stmt);
3142 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3144 if (dump_enabled_p ())
3145 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3146 "reduction: not commutative/associative: ");
3147 return NULL;
3149 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3151 op1 = gimple_assign_rhs1 (def_stmt);
3152 op2 = gimple_assign_rhs2 (def_stmt);
3154 else
3156 if (dump_enabled_p ())
3157 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3158 "reduction: not handled operation: ");
3159 return NULL;
3162 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3164 if (dump_enabled_p ())
3165 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3166 "reduction: both uses not ssa_names: ");
3168 return NULL;
3171 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3172 if ((TREE_CODE (op1) == SSA_NAME
3173 && !types_compatible_p (type,TREE_TYPE (op1)))
3174 || (TREE_CODE (op2) == SSA_NAME
3175 && !types_compatible_p (type, TREE_TYPE (op2)))
3176 || (op3 && TREE_CODE (op3) == SSA_NAME
3177 && !types_compatible_p (type, TREE_TYPE (op3)))
3178 || (op4 && TREE_CODE (op4) == SSA_NAME
3179 && !types_compatible_p (type, TREE_TYPE (op4))))
3181 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "reduction: multiple types: operation type: "
3185 "%T, operands types: %T,%T",
3186 type, TREE_TYPE (op1), TREE_TYPE (op2));
3187 if (op3)
3188 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3190 if (op4)
3191 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3192 dump_printf (MSG_NOTE, "\n");
3195 return NULL;
3198 /* Check whether it's ok to change the order of the computation.
3199 Generally, when vectorizing a reduction we change the order of the
3200 computation. This may change the behavior of the program in some
3201 cases, so we need to check that this is ok. One exception is when
3202 vectorizing an outer-loop: the inner-loop is executed sequentially,
3203 and therefore vectorizing reductions in the inner-loop during
3204 outer-loop vectorization is safe. */
3205 if (check_reduction
3206 && *v_reduc_type == TREE_CODE_REDUCTION
3207 && needs_fold_left_reduction_p (type, code,
3208 need_wrapping_integral_overflow))
3209 *v_reduc_type = FOLD_LEFT_REDUCTION;
3211 /* Reduction is safe. We're dealing with one of the following:
3212 1) integer arithmetic and no trapv
3213 2) floating point arithmetic, and special flags permit this optimization
3214 3) nested cycle (i.e., outer loop vectorization). */
3215 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3216 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3217 if (code != COND_EXPR && !def1_info && !def2_info)
3219 if (dump_enabled_p ())
3220 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3221 return NULL;
3224 /* Check that one def is the reduction def, defined by PHI,
3225 the other def is either defined in the loop ("vect_internal_def"),
3226 or it's an induction (defined by a loop-header phi-node). */
3228 if (def2_info
3229 && def2_info->stmt == phi
3230 && (code == COND_EXPR
3231 || !def1_info
3232 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3233 || vect_valid_reduction_input_p (def1_info)))
3235 if (dump_enabled_p ())
3236 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3237 return def_stmt_info;
3240 if (def1_info
3241 && def1_info->stmt == phi
3242 && (code == COND_EXPR
3243 || !def2_info
3244 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3245 || vect_valid_reduction_input_p (def2_info)))
3247 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3249 /* Check if we can swap operands (just for simplicity - so that
3250 the rest of the code can assume that the reduction variable
3251 is always the last (second) argument). */
3252 if (code == COND_EXPR)
3254 /* Swap cond_expr by inverting the condition. */
3255 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3256 enum tree_code invert_code = ERROR_MARK;
3257 enum tree_code cond_code = TREE_CODE (cond_expr);
3259 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3261 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3262 invert_code = invert_tree_comparison (cond_code, honor_nans);
3264 if (invert_code != ERROR_MARK)
3266 TREE_SET_CODE (cond_expr, invert_code);
3267 swap_ssa_operands (def_stmt,
3268 gimple_assign_rhs2_ptr (def_stmt),
3269 gimple_assign_rhs3_ptr (def_stmt));
3271 else
3273 if (dump_enabled_p ())
3274 report_vect_op (MSG_NOTE, def_stmt,
3275 "detected reduction: cannot swap operands "
3276 "for cond_expr");
3277 return NULL;
3280 else
3281 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3282 gimple_assign_rhs2_ptr (def_stmt));
3284 if (dump_enabled_p ())
3285 report_vect_op (MSG_NOTE, def_stmt,
3286 "detected reduction: need to swap operands: ");
3288 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3289 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3291 else
3293 if (dump_enabled_p ())
3294 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3297 return def_stmt_info;
3300 /* Try to find SLP reduction chain. */
3301 if (! nested_in_vect_loop
3302 && code != COND_EXPR
3303 && orig_code != MINUS_EXPR
3304 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3306 if (dump_enabled_p ())
3307 report_vect_op (MSG_NOTE, def_stmt,
3308 "reduction: detected reduction chain: ");
3310 return def_stmt_info;
3313 /* Look for the expression computing loop_arg from loop PHI result. */
3314 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3315 return def_stmt_info;
3317 if (dump_enabled_p ())
3319 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3320 "reduction: unknown pattern: ");
3323 return NULL;
3326 /* Wrapper around vect_is_simple_reduction, which will modify code
3327 in-place if it enables detection of more reductions. Arguments
3328 as there. */
3330 stmt_vec_info
3331 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3332 bool *double_reduc,
3333 bool need_wrapping_integral_overflow)
3335 enum vect_reduction_type v_reduc_type;
3336 stmt_vec_info def_info
3337 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3338 need_wrapping_integral_overflow,
3339 &v_reduc_type);
3340 if (def_info)
3342 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3343 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3344 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3345 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3347 return def_info;
3350 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
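/* Worked example (illustrative numbers only): with NITERS = 100,
   PEEL_ITERS_PROLOGUE = 3 and an assumed vectorization factor of 8, the
   epilogue gets (100 - 3) % 8 = 1 iteration; the scalar cost vector is then
   charged three times into the prologue costs and once into the epilogue
   costs by the loops below. */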
3351 int
3352 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3353 int *peel_iters_epilogue,
3354 stmt_vector_for_cost *scalar_cost_vec,
3355 stmt_vector_for_cost *prologue_cost_vec,
3356 stmt_vector_for_cost *epilogue_cost_vec)
3358 int retval = 0;
3359 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3361 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3363 *peel_iters_epilogue = assumed_vf / 2;
3364 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_NOTE, vect_location,
3366 "cost model: epilogue peel iters set to vf/2 "
3367 "because loop iterations are unknown .\n");
3369 /* If peeled iterations are known but the number of scalar loop
3370 iterations is unknown, count a taken branch per peeled loop. */
3371 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3372 NULL, 0, vect_prologue);
3373 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3374 NULL, 0, vect_epilogue);
3376 else
3378 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3379 peel_iters_prologue = niters < peel_iters_prologue ?
3380 niters : peel_iters_prologue;
3381 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3382 /* If we need to peel for gaps but no epilogue peeling would otherwise be
3383 required, we have to peel VF iterations. */
3384 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3385 *peel_iters_epilogue = assumed_vf;
3388 stmt_info_for_cost *si;
3389 int j;
3390 if (peel_iters_prologue)
3391 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3392 retval += record_stmt_cost (prologue_cost_vec,
3393 si->count * peel_iters_prologue,
3394 si->kind, si->stmt_info, si->misalign,
3395 vect_prologue);
3396 if (*peel_iters_epilogue)
3397 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3398 retval += record_stmt_cost (epilogue_cost_vec,
3399 si->count * *peel_iters_epilogue,
3400 si->kind, si->stmt_info, si->misalign,
3401 vect_epilogue);
3403 return retval;
3406 /* Function vect_estimate_min_profitable_iters
3408 Return the number of iterations required for the vector version of the
3409 loop to be profitable relative to the cost of the scalar version of the
3410 loop.
3412 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3413 of iterations for vectorization. -1 value means loop vectorization
3414 is not profitable. This returned value may be used for dynamic
3415 profitability check.
3417 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3418 for static check against estimated number of iterations. */
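/* Worked example (illustrative numbers only): with a scalar iteration cost
   of 4, a vector iteration cost of 6, a vectorization factor of 4, a vector
   outside cost of 20 and no peeling or scalar outside cost, the condition
   used at the end of this function,
     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC,
   becomes 4 * niters > 6 * (niters / 4) + 20, i.e. 2.5 * niters > 20, so
   the vector loop starts to pay off for more than 8 iterations. */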
3420 static void
3421 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3422 int *ret_min_profitable_niters,
3423 int *ret_min_profitable_estimate)
3425 int min_profitable_iters;
3426 int min_profitable_estimate;
3427 int peel_iters_prologue;
3428 int peel_iters_epilogue;
3429 unsigned vec_inside_cost = 0;
3430 int vec_outside_cost = 0;
3431 unsigned vec_prologue_cost = 0;
3432 unsigned vec_epilogue_cost = 0;
3433 int scalar_single_iter_cost = 0;
3434 int scalar_outside_cost = 0;
3435 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3436 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3437 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3439 /* Cost model disabled. */
3440 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3442 if (dump_enabled_p ())
3443 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3444 *ret_min_profitable_niters = 0;
3445 *ret_min_profitable_estimate = 0;
3446 return;
3449 /* Requires loop versioning tests to handle misalignment. */
3450 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3452 /* FIXME: Make cost depend on complexity of individual check. */
3453 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3454 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3455 vect_prologue);
3456 if (dump_enabled_p ())
3457 dump_printf (MSG_NOTE,
3458 "cost model: Adding cost of checks for loop "
3459 "versioning to treat misalignment.\n");
3462 /* Requires loop versioning with alias checks. */
3463 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3465 /* FIXME: Make cost depend on complexity of individual check. */
3466 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3467 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3468 vect_prologue);
3469 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3470 if (len)
3471 /* Count LEN - 1 ANDs and LEN comparisons. */
3472 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3473 NULL, 0, vect_prologue);
3474 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3475 if (len)
3477 /* Count LEN - 1 ANDs and LEN comparisons. */
3478 unsigned int nstmts = len * 2 - 1;
3479 /* +1 for each bias that needs adding. */
3480 for (unsigned int i = 0; i < len; ++i)
3481 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3482 nstmts += 1;
3483 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3484 NULL, 0, vect_prologue);
3486 if (dump_enabled_p ())
3487 dump_printf (MSG_NOTE,
3488 "cost model: Adding cost of checks for loop "
3489 "versioning aliasing.\n");
3492 /* Requires loop versioning with niter checks. */
3493 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3495 /* FIXME: Make cost depend on complexity of individual check. */
3496 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3497 vect_prologue);
3498 if (dump_enabled_p ())
3499 dump_printf (MSG_NOTE,
3500 "cost model: Adding cost of checks for loop "
3501 "versioning niters.\n");
3504 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3505 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3506 vect_prologue);
3508 /* Count statements in scalar loop. Using this as scalar cost for a single
3509 iteration for now.
3511 TODO: Add outer loop support.
3513 TODO: Consider assigning different costs to different scalar
3514 statements. */
3516 scalar_single_iter_cost
3517 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3519 /* Add additional cost for the peeled instructions in prologue and epilogue
3520 loop. (For fully-masked loops there will be no peeling.)
3522 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3523 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3525 TODO: Build an expression that represents peel_iters for prologue and
3526 epilogue to be used in a run-time test. */
3528 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3530 peel_iters_prologue = 0;
3531 peel_iters_epilogue = 0;
3533 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3535 /* We need to peel exactly one iteration. */
3536 peel_iters_epilogue += 1;
3537 stmt_info_for_cost *si;
3538 int j;
3539 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3540 j, si)
3541 (void) add_stmt_cost (target_cost_data, si->count,
3542 si->kind, si->stmt_info, si->misalign,
3543 vect_epilogue);
3546 else if (npeel < 0)
3548 peel_iters_prologue = assumed_vf / 2;
3549 if (dump_enabled_p ())
3550 dump_printf (MSG_NOTE, "cost model: "
3551 "prologue peel iters set to vf/2.\n");
3553 /* If peeling for alignment is unknown, loop bound of main loop becomes
3554 unknown. */
3555 peel_iters_epilogue = assumed_vf / 2;
3556 if (dump_enabled_p ())
3557 dump_printf (MSG_NOTE, "cost model: "
3558 "epilogue peel iters set to vf/2 because "
3559 "peeling for alignment is unknown.\n");
3561 /* If peeled iterations are unknown, count a taken branch and a not taken
3562 branch per peeled loop. Even if scalar loop iterations are known,
3563 vector iterations are not known since peeled prologue iterations are
3564 not known. Hence guards remain the same. */
3565 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3566 NULL, 0, vect_prologue);
3567 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3568 NULL, 0, vect_prologue);
3569 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3570 NULL, 0, vect_epilogue);
3571 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3572 NULL, 0, vect_epilogue);
3573 stmt_info_for_cost *si;
3574 int j;
3575 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3577 (void) add_stmt_cost (target_cost_data,
3578 si->count * peel_iters_prologue,
3579 si->kind, si->stmt_info, si->misalign,
3580 vect_prologue);
3581 (void) add_stmt_cost (target_cost_data,
3582 si->count * peel_iters_epilogue,
3583 si->kind, si->stmt_info, si->misalign,
3584 vect_epilogue);
3587 else
3589 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3590 stmt_info_for_cost *si;
3591 int j;
3592 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3594 prologue_cost_vec.create (2);
3595 epilogue_cost_vec.create (2);
3596 peel_iters_prologue = npeel;
3598 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3599 &peel_iters_epilogue,
3600 &LOOP_VINFO_SCALAR_ITERATION_COST
3601 (loop_vinfo),
3602 &prologue_cost_vec,
3603 &epilogue_cost_vec);
3605 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3606 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3607 si->misalign, vect_prologue);
3609 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3610 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3611 si->misalign, vect_epilogue);
3613 prologue_cost_vec.release ();
3614 epilogue_cost_vec.release ();
3617 /* FORNOW: The scalar outside cost is incremented in one of the
3618 following ways:
3620 1. The vectorizer checks for alignment and aliasing and generates
3621 a condition that allows dynamic vectorization. A cost model
3622 check is ANDED with the versioning condition. Hence scalar code
3623 path now has the added cost of the versioning check.
3625 if (cost > th & versioning_check)
3626 jmp to vector code
3628 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3630 2. The vectorizer then checks if a prologue is required. If the
3631 cost model check was not done before during versioning, it has to
3632 be done before the prologue check.
3634 if (cost <= th)
3635 prologue = scalar_iters
3636 if (prologue == 0)
3637 jmp to vector code
3638 else
3639 execute prologue
3640 if (prologue == num_iters)
3641 go to exit
3643 Hence the run-time scalar cost is incremented by a taken branch,
3644 plus a not-taken branch, plus a taken branch cost.
3646 3. The vectorizer then checks if an epilogue is required. If the
3647 cost model check was not done before during prologue check, it
3648 has to be done with the epilogue check.
3650 if (prologue == 0)
3651 jmp to vector code
3652 else
3653 execute prologue
3654 if (prologue == num_iters)
3655 go to exit
3656 vector code:
3657 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3658 jmp to epilogue
3660 Hence the run-time scalar cost should be incremented by 2 taken
3661 branches.
3663 TODO: The back end may reorder the BBs differently and reverse
3664 conditions/branch directions. Change the estimates below to
3665 something more reasonable. */
3667 /* If the number of iterations is known and we do not do versioning, we can
3668 decide whether to vectorize at compile time. Hence the scalar version
3669 does not carry cost model guard costs. */
3670 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3671 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3673 /* Cost model check occurs at versioning. */
3674 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3675 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3676 else
3678 /* Cost model check occurs at prologue generation. */
3679 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3680 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3681 + vect_get_stmt_cost (cond_branch_not_taken);
3682 /* Cost model check occurs at epilogue generation. */
3683 else
3684 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3688 /* Complete the target-specific cost calculations. */
3689 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3690 &vec_inside_cost, &vec_epilogue_cost);
3692 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3694 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3697 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3698 vec_inside_cost);
3699 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3700 vec_prologue_cost);
3701 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3702 vec_epilogue_cost);
3703 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3704 scalar_single_iter_cost);
3705 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3706 scalar_outside_cost);
3707 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3708 vec_outside_cost);
3709 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3710 peel_iters_prologue);
3711 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3712 peel_iters_epilogue);
3715 /* Calculate number of iterations required to make the vector version
3716 profitable, relative to the loop bodies only. The following condition
3717 must hold true:
3718 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3719 where
3720 SIC = scalar iteration cost, VIC = vector iteration cost,
3721 VOC = vector outside cost, VF = vectorization factor,
3722 NPEEL = prologue iterations + epilogue iterations,
3723 SOC = scalar outside cost for run time cost model check. */
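   /* Illustrative worked example (editorial annotation, not part of the
      original comment): with the hypothetical costs SIC = 4, VIC = 6,
      VF = 4, NPEEL = 2, SOC = 0 and VOC = 20, the scalar side costs
      4 * niters while the vector side costs 6 * ((niters - 2) / 4) + 20.
      The scalar cost first exceeds the vector cost at niters = 7
      (28 > 26 with the division floored), so roughly seven scalar
      iterations are needed before vectorization pays off under these
      made-up numbers.  */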
3725 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3726 - vec_inside_cost);
3727 if (saving_per_viter <= 0)
3729 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3730 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3731 "vectorization did not happen for a simd loop");
3733 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3735 "cost model: the vector iteration cost = %d "
3736 "divided by the scalar iteration cost = %d "
3737 "is greater or equal to the vectorization factor = %d"
3738 ".\n",
3739 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3740 *ret_min_profitable_niters = -1;
3741 *ret_min_profitable_estimate = -1;
3742 return;
3745 /* ??? The "if" arm is written to handle all cases; see below for what
3746 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3747 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3749 /* Rewriting the condition above in terms of the number of
3750 vector iterations (vniters) rather than the number of
3751 scalar iterations (niters) gives:
3753 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3755 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3757 For integer N, X and Y when X > 0:
3759 N * X > Y <==> N >= (Y /[floor] X) + 1. */
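      /* Quick sanity check of the identity above (illustrative numbers
	 only): with Y = 12 and X = 10, N * 10 > 12 holds exactly for
	 N >= (12 /[floor] 10) + 1 = 2.  */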
3760 int outside_overhead = (vec_outside_cost
3761 - scalar_single_iter_cost * peel_iters_prologue
3762 - scalar_single_iter_cost * peel_iters_epilogue
3763 - scalar_outside_cost);
3764 /* We're only interested in cases that require at least one
3765 vector iteration. */
3766 int min_vec_niters = 1;
3767 if (outside_overhead > 0)
3768 min_vec_niters = outside_overhead / saving_per_viter + 1;
3770 if (dump_enabled_p ())
3771 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3772 min_vec_niters);
3774 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3776 /* Now that we know the minimum number of vector iterations,
3777 find the minimum niters for which the scalar cost is larger:
3779 SIC * niters > VIC * vniters + VOC - SOC
3781 We know that the minimum niters is no more than
3782 vniters * VF + NPEEL, but it might be (and often is) less
3783 than that if a partial vector iteration is cheaper than the
3784 equivalent scalar code. */
3785 int threshold = (vec_inside_cost * min_vec_niters
3786 + vec_outside_cost
3787 - scalar_outside_cost);
3788 if (threshold <= 0)
3789 min_profitable_iters = 1;
3790 else
3791 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3793 else
3794 /* Convert the number of vector iterations into a number of
3795 scalar iterations. */
3796 min_profitable_iters = (min_vec_niters * assumed_vf
3797 + peel_iters_prologue
3798 + peel_iters_epilogue);
3800 else
3802 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3803 * assumed_vf
3804 - vec_inside_cost * peel_iters_prologue
3805 - vec_inside_cost * peel_iters_epilogue);
3806 if (min_profitable_iters <= 0)
3807 min_profitable_iters = 0;
3808 else
3810 min_profitable_iters /= saving_per_viter;
3812 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3813 <= (((int) vec_inside_cost * min_profitable_iters)
3814 + (((int) vec_outside_cost - scalar_outside_cost)
3815 * assumed_vf)))
3816 min_profitable_iters++;
3820 if (dump_enabled_p ())
3821 dump_printf (MSG_NOTE,
3822 " Calculated minimum iters for profitability: %d\n",
3823 min_profitable_iters);
3825 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3826 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3827 /* We want the vectorized loop to execute at least once. */
3828 min_profitable_iters = assumed_vf + peel_iters_prologue;
3830 if (dump_enabled_p ())
3831 dump_printf_loc (MSG_NOTE, vect_location,
3832 " Runtime profitability threshold = %d\n",
3833 min_profitable_iters);
3835 *ret_min_profitable_niters = min_profitable_iters;
3837 /* Calculate number of iterations required to make the vector version
3838 profitable, relative to the loop bodies only.
3840 The non-vectorized variant costs SIC * niters and it must win over the
3841 vector variant on the expected loop trip count. The following condition must hold true:
3842 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3844 if (vec_outside_cost <= 0)
3845 min_profitable_estimate = 0;
3846 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3848 /* This is a repeat of the code above, but with + SOC rather
3849 than - SOC. */
3850 int outside_overhead = (vec_outside_cost
3851 - scalar_single_iter_cost * peel_iters_prologue
3852 - scalar_single_iter_cost * peel_iters_epilogue
3853 + scalar_outside_cost);
3854 int min_vec_niters = 1;
3855 if (outside_overhead > 0)
3856 min_vec_niters = outside_overhead / saving_per_viter + 1;
3858 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3860 int threshold = (vec_inside_cost * min_vec_niters
3861 + vec_outside_cost
3862 + scalar_outside_cost);
3863 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3865 else
3866 min_profitable_estimate = (min_vec_niters * assumed_vf
3867 + peel_iters_prologue
3868 + peel_iters_epilogue);
3870 else
3872 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3873 * assumed_vf
3874 - vec_inside_cost * peel_iters_prologue
3875 - vec_inside_cost * peel_iters_epilogue)
3876 / ((scalar_single_iter_cost * assumed_vf)
3877 - vec_inside_cost);
3879 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3880 if (dump_enabled_p ())
3881 dump_printf_loc (MSG_NOTE, vect_location,
3882 " Static estimate profitability threshold = %d\n",
3883 min_profitable_estimate);
3885 *ret_min_profitable_estimate = min_profitable_estimate;
3888 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3889 vector elements (not bits) for a vector with NELT elements. */
3890 static void
3891 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3892 vec_perm_builder *sel)
3894 /* The encoding is a single stepped pattern. Any wrap-around is handled
3895 by vec_perm_indices. */
3896 sel->new_vector (nelt, 1, 3);
3897 for (unsigned int i = 0; i < 3; i++)
3898 sel->quick_push (i + offset);
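/* Editorial illustration (not from the original sources): with OFFSET == 2
   and NELT == 8 the stepped encoding above expands to the selector
   { 2, 3, 4, 5, 6, 7, 8, 9 }; indices >= NELT refer to the second input of
   the two-input permutation that vec_perm_indices later builds, which is
   what makes this act as a whole-vector shift by two elements.  */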
3901 /* Checks whether the target supports whole-vector shifts for vectors of mode
3902 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3903 it supports vec_perm_const with masks for all necessary shift amounts. */
3904 static bool
3905 have_whole_vector_shift (machine_mode mode)
3907 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3908 return true;
3910 /* Variable-length vectors should be handled via the optab. */
3911 unsigned int nelt;
3912 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3913 return false;
3915 vec_perm_builder sel;
3916 vec_perm_indices indices;
3917 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3919 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3920 indices.new_vector (sel, 2, nelt);
3921 if (!can_vec_perm_const_p (mode, indices, false))
3922 return false;
3924 return true;
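/* Illustrative note (editorial, not part of the original comment): for
   NELT == 8 the loop above checks permutation masks for whole-vector shifts
   by 4, 2 and 1 elements; only if all of them are supported do we report
   that whole-vector shifts are available without vec_shr_optab.  */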
3927 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3928 functions. Design better to avoid maintenance issues. */
3930 /* Function vect_model_reduction_cost.
3932 Models cost for a reduction operation, including the vector ops
3933 generated within the strip-mine loop, the initial definition before
3934 the loop, and the epilogue code that must be generated. */
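/* Editorial note (a reading of the code below, not an addition to it): for
   a plain sum reduction with ncopies == 1, a supported REDUC_FN and no
   nesting, this typically records one scalar_to_vec in the prologue, one
   vector_stmt in the loop body, and a vector_stmt plus a vec_to_scalar in
   the epilogue.  */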
3936 static void
3937 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3938 int ncopies, stmt_vector_for_cost *cost_vec)
3940 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3941 enum tree_code code;
3942 optab optab;
3943 tree vectype;
3944 machine_mode mode;
3945 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3946 struct loop *loop = NULL;
3948 if (loop_vinfo)
3949 loop = LOOP_VINFO_LOOP (loop_vinfo);
3951 /* Condition reductions generate two reductions in the loop. */
3952 vect_reduction_type reduction_type
3953 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3954 if (reduction_type == COND_REDUCTION)
3955 ncopies *= 2;
3957 vectype = STMT_VINFO_VECTYPE (stmt_info);
3958 mode = TYPE_MODE (vectype);
3959 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3961 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3963 if (reduction_type == EXTRACT_LAST_REDUCTION
3964 || reduction_type == FOLD_LEFT_REDUCTION)
3966 /* No extra instructions needed in the prologue. */
3967 prologue_cost = 0;
3969 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3970 /* Count one reduction-like operation per vector. */
3971 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3972 stmt_info, 0, vect_body);
3973 else
3975 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3976 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3977 inside_cost = record_stmt_cost (cost_vec, nelements,
3978 vec_to_scalar, stmt_info, 0,
3979 vect_body);
3980 inside_cost += record_stmt_cost (cost_vec, nelements,
3981 scalar_stmt, stmt_info, 0,
3982 vect_body);
3985 else
3987 /* Add in cost for initial definition.
3988 For cond reduction we have four vectors: initial index, step,
3989 initial result of the data reduction, initial value of the index
3990 reduction. */
3991 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3992 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3993 scalar_to_vec, stmt_info, 0,
3994 vect_prologue);
3996 /* Cost of reduction op inside loop. */
3997 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3998 stmt_info, 0, vect_body);
4001 /* Determine cost of epilogue code.
4003 We have a reduction operator that will reduce the vector in one statement.
4004 Also requires scalar extract. */
4006 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4008 if (reduc_fn != IFN_LAST)
4010 if (reduction_type == COND_REDUCTION)
4012 /* An EQ stmt and a COND_EXPR stmt. */
4013 epilogue_cost += record_stmt_cost (cost_vec, 2,
4014 vector_stmt, stmt_info, 0,
4015 vect_epilogue);
4016 /* Reduction of the max index and a reduction of the found
4017 values. */
4018 epilogue_cost += record_stmt_cost (cost_vec, 2,
4019 vec_to_scalar, stmt_info, 0,
4020 vect_epilogue);
4021 /* A broadcast of the max value. */
4022 epilogue_cost += record_stmt_cost (cost_vec, 1,
4023 scalar_to_vec, stmt_info, 0,
4024 vect_epilogue);
4026 else
4028 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4029 stmt_info, 0, vect_epilogue);
4030 epilogue_cost += record_stmt_cost (cost_vec, 1,
4031 vec_to_scalar, stmt_info, 0,
4032 vect_epilogue);
4035 else if (reduction_type == COND_REDUCTION)
4037 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4038 /* Extraction of scalar elements. */
4039 epilogue_cost += record_stmt_cost (cost_vec,
4040 2 * estimated_nunits,
4041 vec_to_scalar, stmt_info, 0,
4042 vect_epilogue);
4043 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4044 epilogue_cost += record_stmt_cost (cost_vec,
4045 2 * estimated_nunits - 3,
4046 scalar_stmt, stmt_info, 0,
4047 vect_epilogue);
4049 else if (reduction_type == EXTRACT_LAST_REDUCTION
4050 || reduction_type == FOLD_LEFT_REDUCTION)
4051 /* No extra instructions are needed in the epilogue. */
4053 else
4055 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4056 tree bitsize =
4057 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4058 int element_bitsize = tree_to_uhwi (bitsize);
4059 int nelements = vec_size_in_bits / element_bitsize;
4061 if (code == COND_EXPR)
4062 code = MAX_EXPR;
4064 optab = optab_for_tree_code (code, vectype, optab_default);
4066 /* We have a whole vector shift available. */
4067 if (optab != unknown_optab
4068 && VECTOR_MODE_P (mode)
4069 && optab_handler (optab, mode) != CODE_FOR_nothing
4070 && have_whole_vector_shift (mode))
4072 /* Final reduction via vector shifts and the reduction operator.
4073 Also requires scalar extract. */
4074 epilogue_cost += record_stmt_cost (cost_vec,
4075 exact_log2 (nelements) * 2,
4076 vector_stmt, stmt_info, 0,
4077 vect_epilogue);
4078 epilogue_cost += record_stmt_cost (cost_vec, 1,
4079 vec_to_scalar, stmt_info, 0,
4080 vect_epilogue);
4082 else
4083 /* Use extracts and reduction op for final reduction. For N
4084 elements, we have N extracts and N-1 reduction ops. */
4085 epilogue_cost += record_stmt_cost (cost_vec,
4086 nelements + nelements - 1,
4087 vector_stmt, stmt_info, 0,
4088 vect_epilogue);
4092 if (dump_enabled_p ())
4093 dump_printf (MSG_NOTE,
4094 "vect_model_reduction_cost: inside_cost = %d, "
4095 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4096 prologue_cost, epilogue_cost);
4100 /* Function vect_model_induction_cost.
4102 Models cost for induction operations. */
4104 static void
4105 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4106 stmt_vector_for_cost *cost_vec)
4108 unsigned inside_cost, prologue_cost;
4110 if (PURE_SLP_STMT (stmt_info))
4111 return;
4113 /* loop cost for vec_loop. */
4114 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4115 stmt_info, 0, vect_body);
4117 /* prologue cost for vec_init and vec_step. */
4118 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4119 stmt_info, 0, vect_prologue);
4121 if (dump_enabled_p ())
4122 dump_printf_loc (MSG_NOTE, vect_location,
4123 "vect_model_induction_cost: inside_cost = %d, "
4124 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4129 /* Function get_initial_def_for_reduction
4131 Input:
4132 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4133 INIT_VAL - the initial value of the reduction variable
4135 Output:
4136 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4137 of the reduction (used for adjusting the epilog - see below).
4138 Return a vector variable, initialized according to the operation that
4139 STMT_VINFO performs. This vector will be used as the initial value
4140 of the vector of partial results.
4142 Option1 (adjust in epilog): Initialize the vector as follows:
4143 add/bit or/xor: [0,0,...,0,0]
4144 mult/bit and: [1,1,...,1,1]
4145 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4146 and when necessary (e.g. add/mult case) let the caller know
4147 that it needs to adjust the result by init_val.
4149 Option2: Initialize the vector as follows:
4150 add/bit or/xor: [init_val,0,0,...,0]
4151 mult/bit and: [init_val,1,1,...,1]
4152 min/max/cond_expr: [init_val,init_val,...,init_val]
4153 and no adjustments are needed.
4155 For example, for the following code:
4157 s = init_val;
4158 for (i=0;i<n;i++)
4159 s = s + a[i];
4161 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4162 For a vector of 4 units, we want to return either [0,0,0,init_val],
4163 or [0,0,0,0] and let the caller know that it needs to adjust
4164 the result at the end by 'init_val'.
4166 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4167 is not NULL, because this way the initialization vector is simpler (the
4168 same element in all entries), and Option2 otherwise.
4170 A cost model should help decide between these two schemes. */
4172 tree
4173 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4174 tree *adjustment_def)
4176 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4177 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4178 tree scalar_type = TREE_TYPE (init_val);
4179 tree vectype = get_vectype_for_scalar_type (scalar_type);
4180 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4181 tree def_for_init;
4182 tree init_def;
4183 REAL_VALUE_TYPE real_init_val = dconst0;
4184 int int_init_val = 0;
4185 gimple_seq stmts = NULL;
4187 gcc_assert (vectype);
4189 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4190 || SCALAR_FLOAT_TYPE_P (scalar_type));
4192 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4193 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4195 vect_reduction_type reduction_type
4196 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4198 switch (code)
4200 case WIDEN_SUM_EXPR:
4201 case DOT_PROD_EXPR:
4202 case SAD_EXPR:
4203 case PLUS_EXPR:
4204 case MINUS_EXPR:
4205 case BIT_IOR_EXPR:
4206 case BIT_XOR_EXPR:
4207 case MULT_EXPR:
4208 case BIT_AND_EXPR:
4210 /* ADJUSTMENT_DEF is NULL when called from
4211 vect_create_epilog_for_reduction to vectorize double reduction. */
4212 if (adjustment_def)
4213 *adjustment_def = init_val;
4215 if (code == MULT_EXPR)
4217 real_init_val = dconst1;
4218 int_init_val = 1;
4221 if (code == BIT_AND_EXPR)
4222 int_init_val = -1;
4224 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4225 def_for_init = build_real (scalar_type, real_init_val);
4226 else
4227 def_for_init = build_int_cst (scalar_type, int_init_val);
4229 if (adjustment_def)
4230 /* Option1: the first element is '0' or '1' as well. */
4231 init_def = gimple_build_vector_from_val (&stmts, vectype,
4232 def_for_init);
4233 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4235 /* Option2 (variable length): the first element is INIT_VAL. */
4236 init_def = gimple_build_vector_from_val (&stmts, vectype,
4237 def_for_init);
4238 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4239 vectype, init_def, init_val);
4241 else
4243 /* Option2: the first element is INIT_VAL. */
4244 tree_vector_builder elts (vectype, 1, 2);
4245 elts.quick_push (init_val);
4246 elts.quick_push (def_for_init);
4247 init_def = gimple_build_vector (&stmts, &elts);
4250 break;
4252 case MIN_EXPR:
4253 case MAX_EXPR:
4254 case COND_EXPR:
4256 if (adjustment_def)
4258 *adjustment_def = NULL_TREE;
4259 if (reduction_type != COND_REDUCTION
4260 && reduction_type != EXTRACT_LAST_REDUCTION)
4262 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4263 break;
4266 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4267 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4269 break;
4271 default:
4272 gcc_unreachable ();
4275 if (stmts)
4276 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4277 return init_def;
4280 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4281 NUMBER_OF_VECTORS is the number of vector defs to create.
4282 If NEUTRAL_OP is nonnull, introducing extra elements of that
4283 value will not change the result. */
4285 static void
4286 get_initial_defs_for_reduction (slp_tree slp_node,
4287 vec<tree> *vec_oprnds,
4288 unsigned int number_of_vectors,
4289 bool reduc_chain, tree neutral_op)
4291 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4292 stmt_vec_info stmt_vinfo = stmts[0];
4293 unsigned HOST_WIDE_INT nunits;
4294 unsigned j, number_of_places_left_in_vector;
4295 tree vector_type;
4296 unsigned int group_size = stmts.length ();
4297 unsigned int i;
4298 struct loop *loop;
4300 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4302 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4304 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4305 gcc_assert (loop);
4306 edge pe = loop_preheader_edge (loop);
4308 gcc_assert (!reduc_chain || neutral_op);
4310 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4311 created vectors. It is greater than 1 if unrolling is performed.
4313 For example, we have two scalar operands, s1 and s2 (e.g., group of
4314 strided accesses of size two), while NUNITS is four (i.e., four scalars
4315 of this type can be packed in a vector). The output vector will contain
4316 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4317 will be 2).
4319 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4320 vectors containing the operands.
4322 For example, NUNITS is four as before, and the group size is 8
4323 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4324 {s5, s6, s7, s8}. */
4326 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4327 nunits = group_size;
4329 number_of_places_left_in_vector = nunits;
4330 bool constant_p = true;
4331 tree_vector_builder elts (vector_type, nunits, 1);
4332 elts.quick_grow (nunits);
4333 gimple_seq ctor_seq = NULL;
4334 for (j = 0; j < nunits * number_of_vectors; ++j)
4336 tree op;
4337 i = j % group_size;
4338 stmt_vinfo = stmts[i];
4340 /* Get the def before the loop. In a reduction chain we have only
4341 one initial value; otherwise we have as many as there are PHIs in the group. */
4342 if (reduc_chain)
4343 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4344 else if (((vec_oprnds->length () + 1) * nunits
4345 - number_of_places_left_in_vector >= group_size)
4346 && neutral_op)
4347 op = neutral_op;
4348 else
4349 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4351 /* Create 'vect_ = {op0,op1,...,opn}'. */
4352 number_of_places_left_in_vector--;
4353 elts[nunits - number_of_places_left_in_vector - 1] = op;
4354 if (!CONSTANT_CLASS_P (op))
4355 constant_p = false;
4357 if (number_of_places_left_in_vector == 0)
4359 tree init;
4360 if (constant_p && !neutral_op
4361 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4362 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4363 /* Build the vector directly from ELTS. */
4364 init = gimple_build_vector (&ctor_seq, &elts);
4365 else if (neutral_op)
4367 /* Build a vector of the neutral value and shift the
4368 other elements into place. */
4369 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4370 neutral_op);
4371 int k = nunits;
4372 while (k > 0 && elts[k - 1] == neutral_op)
4373 k -= 1;
4374 while (k > 0)
4376 k -= 1;
4377 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4378 vector_type, init, elts[k]);
4381 else
4383 /* First time round, duplicate ELTS to fill the
4384 required number of vectors. */
4385 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4386 number_of_vectors, *vec_oprnds);
4387 break;
4389 vec_oprnds->quick_push (init);
4391 number_of_places_left_in_vector = nunits;
4392 elts.new_vector (vector_type, nunits, 1);
4393 elts.quick_grow (nunits);
4394 constant_p = true;
4397 if (ctor_seq != NULL)
4398 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4402 /* Function vect_create_epilog_for_reduction
4404 Create code at the loop-epilog to finalize the result of a reduction
4405 computation.
4407 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4408 reduction statements.
4409 STMT_INFO is the scalar reduction stmt that is being vectorized.
4410 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4411 number of elements that we can fit in a vectype (nunits). In this case
4412 we have to generate more than one vector stmt, i.e., we need to "unroll"
4413 the vector stmt by a factor VF/nunits. For more details see documentation
4414 in vectorizable_operation.
4415 REDUC_FN is the internal function for the epilog reduction.
4416 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4417 computation.
4418 REDUC_INDEX is the index of the operand in the right hand side of the
4419 statement that is defined by REDUCTION_PHI.
4420 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4421 SLP_NODE is an SLP node containing a group of reduction statements. The
4422 first one in this group is STMT_INFO.
4423 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4424 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4425 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4426 any value of the IV in the loop.
4427 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4428 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4429 null if this is not an SLP reduction.
4431 This function:
4432 1. Creates the reduction def-use cycles: sets the arguments for
4433 REDUCTION_PHIS:
4434 The loop-entry argument is the vectorized initial-value of the reduction.
4435 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4436 sums.
4437 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4438 by calling the function specified by REDUC_FN if available, or by
4439 other means (whole-vector shifts or a scalar loop).
4440 The function also creates a new phi node at the loop exit to preserve
4441 loop-closed form, as illustrated below.
4443 The flow at the entry to this function:
4445 loop:
4446 vec_def = phi <null, null> # REDUCTION_PHI
4447 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4448 s_loop = scalar_stmt # (scalar) STMT_INFO
4449 loop_exit:
4450 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4451 use <s_out0>
4452 use <s_out0>
4454 The above is transformed by this function into:
4456 loop:
4457 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4458 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4459 s_loop = scalar_stmt # (scalar) STMT_INFO
4460 loop_exit:
4461 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4462 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4463 v_out2 = reduce <v_out1>
4464 s_out3 = extract_field <v_out2, 0>
4465 s_out4 = adjust_result <s_out3>
4466 use <s_out4>
4467 use <s_out4>
4470 static void
4471 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4472 stmt_vec_info stmt_info,
4473 gimple *reduc_def_stmt,
4474 int ncopies, internal_fn reduc_fn,
4475 vec<stmt_vec_info> reduction_phis,
4476 bool double_reduc,
4477 slp_tree slp_node,
4478 slp_instance slp_node_instance,
4479 tree induc_val, enum tree_code induc_code,
4480 tree neutral_op)
4482 stmt_vec_info prev_phi_info;
4483 tree vectype;
4484 machine_mode mode;
4485 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4486 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4487 basic_block exit_bb;
4488 tree scalar_dest;
4489 tree scalar_type;
4490 gimple *new_phi = NULL, *phi;
4491 stmt_vec_info phi_info;
4492 gimple_stmt_iterator exit_gsi;
4493 tree vec_dest;
4494 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4495 gimple *epilog_stmt = NULL;
4496 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4497 gimple *exit_phi;
4498 tree bitsize;
4499 tree adjustment_def = NULL;
4500 tree vec_initial_def = NULL;
4501 tree expr, def, initial_def = NULL;
4502 tree orig_name, scalar_result;
4503 imm_use_iterator imm_iter, phi_imm_iter;
4504 use_operand_p use_p, phi_use_p;
4505 gimple *use_stmt;
4506 stmt_vec_info reduction_phi_info = NULL;
4507 bool nested_in_vect_loop = false;
4508 auto_vec<gimple *> new_phis;
4509 auto_vec<stmt_vec_info> inner_phis;
4510 int j, i;
4511 auto_vec<tree> scalar_results;
4512 unsigned int group_size = 1, k, ratio;
4513 auto_vec<tree> vec_initial_defs;
4514 auto_vec<gimple *> phis;
4515 bool slp_reduc = false;
4516 bool direct_slp_reduc;
4517 tree new_phi_result;
4518 stmt_vec_info inner_phi = NULL;
4519 tree induction_index = NULL_TREE;
4521 if (slp_node)
4522 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4524 if (nested_in_vect_loop_p (loop, stmt_info))
4526 outer_loop = loop;
4527 loop = loop->inner;
4528 nested_in_vect_loop = true;
4529 gcc_assert (!slp_node);
4532 vectype = STMT_VINFO_VECTYPE (stmt_info);
4533 gcc_assert (vectype);
4534 mode = TYPE_MODE (vectype);
4536 /* 1. Create the reduction def-use cycle:
4537 Set the arguments of REDUCTION_PHIS, i.e., transform
4539 loop:
4540 vec_def = phi <null, null> # REDUCTION_PHI
4541 VECT_DEF = vector_stmt # vectorized form of STMT
4544 into:
4546 loop:
4547 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4548 VECT_DEF = vector_stmt # vectorized form of STMT
4551 (in case of SLP, do it for all the phis). */
4553 /* Get the loop-entry arguments. */
4554 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4555 if (slp_node)
4557 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4558 vec_initial_defs.reserve (vec_num);
4559 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4560 &vec_initial_defs, vec_num,
4561 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4562 neutral_op);
4564 else
4566 /* Get at the scalar def before the loop, that defines the initial value
4567 of the reduction variable. */
4568 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4569 loop_preheader_edge (loop));
4570 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4571 and we can't use zero for induc_val, use initial_def. Similarly
4572 for REDUC_MIN and initial_def larger than the base. */
4573 if (TREE_CODE (initial_def) == INTEGER_CST
4574 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4575 == INTEGER_INDUC_COND_REDUCTION)
4576 && !integer_zerop (induc_val)
4577 && ((induc_code == MAX_EXPR
4578 && tree_int_cst_lt (initial_def, induc_val))
4579 || (induc_code == MIN_EXPR
4580 && tree_int_cst_lt (induc_val, initial_def))))
4581 induc_val = initial_def;
4583 if (double_reduc)
4584 /* In case of double reduction we only create a vector variable
4585 to be put in the reduction phi node. The actual statement
4586 creation is done later in this function. */
4587 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4588 else if (nested_in_vect_loop)
4590 /* Do not use an adjustment def as that case is not supported
4591 correctly if ncopies is not one. */
4592 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4593 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4594 stmt_info);
4596 else
4597 vec_initial_def
4598 = get_initial_def_for_reduction (stmt_info, initial_def,
4599 &adjustment_def);
4600 vec_initial_defs.create (1);
4601 vec_initial_defs.quick_push (vec_initial_def);
4604 /* Set phi nodes arguments. */
4605 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4607 tree vec_init_def = vec_initial_defs[i];
4608 tree def = vect_defs[i];
4609 for (j = 0; j < ncopies; j++)
4611 if (j != 0)
4613 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4614 if (nested_in_vect_loop)
4615 vec_init_def
4616 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4619 /* Set the loop-entry arg of the reduction-phi. */
4621 gphi *phi = as_a <gphi *> (phi_info->stmt);
4622 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4623 == INTEGER_INDUC_COND_REDUCTION)
4625 /* Initialise the reduction phi to zero. This prevents non-zero initial
4626 values from interfering with the reduction op. */
4627 gcc_assert (ncopies == 1);
4628 gcc_assert (i == 0);
4630 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4631 tree induc_val_vec
4632 = build_vector_from_val (vec_init_def_type, induc_val);
4634 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4635 UNKNOWN_LOCATION);
4637 else
4638 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4639 UNKNOWN_LOCATION);
4641 /* Set the loop-latch arg for the reduction-phi. */
4642 if (j > 0)
4643 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4645 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4647 if (dump_enabled_p ())
4648 dump_printf_loc (MSG_NOTE, vect_location,
4649 "transform reduction: created def-use cycle: %G%G",
4650 phi, SSA_NAME_DEF_STMT (def));
4654 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4655 which is updated with the current index of the loop for every match of
4656 the original loop's cond_expr (VEC_STMT). This results in a vector
4657 containing the last time the condition passed for that vector lane.
4658 The first match will be a 1 to allow 0 to be used for non-matching
4659 indexes. If there are no matches at all then the vector will be all
4660 zeroes. */
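  /* Illustrative example (editorial annotation): with four lanes the index
     IV created below takes the values { 1, 2, 3, 4 }, then { 5, 6, 7, 8 },
     and so on; a lane whose condition last matched during the second vector
     iteration therefore ends up holding a value between 5 and 8, while a
     lane that never matched keeps the value 0, so the maximum over all
     lanes identifies the last match.  */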
4661 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4663 tree indx_before_incr, indx_after_incr;
4664 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4666 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4667 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4669 int scalar_precision
4670 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4671 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4672 tree cr_index_vector_type = build_vector_type
4673 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4675 /* First we create a simple vector induction variable which starts
4676 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4677 vector size (STEP). */
4679 /* Create a {1,2,3,...} vector. */
4680 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4682 /* Create a vector of the step value. */
4683 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4684 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4686 /* Create an induction variable. */
4687 gimple_stmt_iterator incr_gsi;
4688 bool insert_after;
4689 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4690 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4691 insert_after, &indx_before_incr, &indx_after_incr);
4693 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4694 filled with zeros (VEC_ZERO). */
4696 /* Create a vector of 0s. */
4697 tree zero = build_zero_cst (cr_index_scalar_type);
4698 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4700 /* Create a vector phi node. */
4701 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4702 new_phi = create_phi_node (new_phi_tree, loop->header);
4703 loop_vinfo->add_stmt (new_phi);
4704 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4705 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4707 /* Now take the condition from the loop's original cond_expr
4708 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4709 every match uses values from the induction variable
4710 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4711 (NEW_PHI_TREE).
4712 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4713 the new cond_expr (INDEX_COND_EXPR). */
4715 /* Duplicate the condition from vec_stmt. */
4716 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4718 /* Create a conditional where the condition is taken from vec_stmt
4719 (CCOMPARE), the 'then' value is the induction index (INDEX_BEFORE_INCR)
4720 and the 'else' value is the phi (NEW_PHI_TREE). */
4721 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4722 ccompare, indx_before_incr,
4723 new_phi_tree);
4724 induction_index = make_ssa_name (cr_index_vector_type);
4725 gimple *index_condition = gimple_build_assign (induction_index,
4726 index_cond_expr);
4727 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4728 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4729 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4731 /* Update the phi with the vec cond. */
4732 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4733 loop_latch_edge (loop), UNKNOWN_LOCATION);
4736 /* 2. Create epilog code.
4737 The reduction epilog code operates across the elements of the vector
4738 of partial results computed by the vectorized loop.
4739 The reduction epilog code consists of:
4741 step 1: compute the scalar result in a vector (v_out2)
4742 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4743 step 3: adjust the scalar result (s_out3) if needed.
4745 Step 1 can be accomplished using one of the following three schemes:
4746 (scheme 1) using reduc_fn, if available.
4747 (scheme 2) using whole-vector shifts, if available.
4748 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4749 combined.
4751 The overall epilog code looks like this:
4753 s_out0 = phi <s_loop> # original EXIT_PHI
4754 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4755 v_out2 = reduce <v_out1> # step 1
4756 s_out3 = extract_field <v_out2, 0> # step 2
4757 s_out4 = adjust_result <s_out3> # step 3
4759 (step 3 is optional, and steps 1 and 2 may be combined).
4760 Lastly, the uses of s_out0 are replaced by s_out4. */
4763 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4764 v_out1 = phi <VECT_DEF>
4765 Store them in NEW_PHIS. */
4767 exit_bb = single_exit (loop)->dest;
4768 prev_phi_info = NULL;
4769 new_phis.create (vect_defs.length ());
4770 FOR_EACH_VEC_ELT (vect_defs, i, def)
4772 for (j = 0; j < ncopies; j++)
4774 tree new_def = copy_ssa_name (def);
4775 phi = create_phi_node (new_def, exit_bb);
4776 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4777 if (j == 0)
4778 new_phis.quick_push (phi);
4779 else
4781 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4782 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4785 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4786 prev_phi_info = phi_info;
4790 /* The epilogue is created for the outer-loop, i.e., for the loop being
4791 vectorized. Create exit phis for the outer loop. */
4792 if (double_reduc)
4794 loop = outer_loop;
4795 exit_bb = single_exit (loop)->dest;
4796 inner_phis.create (vect_defs.length ());
4797 FOR_EACH_VEC_ELT (new_phis, i, phi)
4799 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4800 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4801 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4802 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4803 PHI_RESULT (phi));
4804 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4805 inner_phis.quick_push (phi_info);
4806 new_phis[i] = outer_phi;
4807 while (STMT_VINFO_RELATED_STMT (phi_info))
4809 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4810 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4811 outer_phi = create_phi_node (new_result, exit_bb);
4812 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4813 PHI_RESULT (phi_info->stmt));
4814 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4815 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4816 prev_phi_info = outer_phi_info;
4821 exit_gsi = gsi_after_labels (exit_bb);
4823 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4824 (i.e. when reduc_fn is not available) and in the final adjustment
4825 code (if needed). Also get the original scalar reduction variable as
4826 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4827 represents a reduction pattern), the tree-code and scalar-def are
4828 taken from the original stmt that the pattern-stmt (STMT) replaces.
4829 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4830 are taken from STMT. */
4832 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4833 if (orig_stmt_info != stmt_info)
4835 /* Reduction pattern */
4836 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4837 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4840 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4841 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4842 partial results are added and not subtracted. */
4843 if (code == MINUS_EXPR)
4844 code = PLUS_EXPR;
4846 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4847 scalar_type = TREE_TYPE (scalar_dest);
4848 scalar_results.create (group_size);
4849 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4850 bitsize = TYPE_SIZE (scalar_type);
4852 /* In case this is a reduction in an inner-loop while vectorizing an outer
4853 loop - we don't need to extract a single scalar result at the end of the
4854 inner-loop (unless it is double reduction, i.e., the use of reduction is
4855 outside the outer-loop). The final vector of partial results will be used
4856 in the vectorized outer-loop, or reduced to a scalar result at the end of
4857 the outer-loop. */
4858 if (nested_in_vect_loop && !double_reduc)
4859 goto vect_finalize_reduction;
4861 /* SLP reduction without reduction chain, e.g.,
4862 # a1 = phi <a2, a0>
4863 # b1 = phi <b2, b0>
4864 a2 = operation (a1)
4865 b2 = operation (b1) */
4866 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4868 /* True if we should implement SLP_REDUC using native reduction operations
4869 instead of scalar operations. */
4870 direct_slp_reduc = (reduc_fn != IFN_LAST
4871 && slp_reduc
4872 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4874 /* In case of reduction chain, e.g.,
4875 # a1 = phi <a3, a0>
4876 a2 = operation (a1)
4877 a3 = operation (a2),
4879 we may end up with more than one vector result. Here we reduce them to
4880 one vector. */
4881 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4883 tree first_vect = PHI_RESULT (new_phis[0]);
4884 gassign *new_vec_stmt = NULL;
4885 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4886 for (k = 1; k < new_phis.length (); k++)
4888 gimple *next_phi = new_phis[k];
4889 tree second_vect = PHI_RESULT (next_phi);
4890 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4891 new_vec_stmt = gimple_build_assign (tem, code,
4892 first_vect, second_vect);
4893 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4894 first_vect = tem;
4897 new_phi_result = first_vect;
4898 if (new_vec_stmt)
4900 new_phis.truncate (0);
4901 new_phis.safe_push (new_vec_stmt);
4904 /* Likewise if we couldn't use a single def-use cycle. */
4905 else if (ncopies > 1)
4907 gcc_assert (new_phis.length () == 1);
4908 tree first_vect = PHI_RESULT (new_phis[0]);
4909 gassign *new_vec_stmt = NULL;
4910 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4911 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4912 for (int k = 1; k < ncopies; ++k)
4914 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4915 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4916 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4917 new_vec_stmt = gimple_build_assign (tem, code,
4918 first_vect, second_vect);
4919 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4920 first_vect = tem;
4922 new_phi_result = first_vect;
4923 new_phis.truncate (0);
4924 new_phis.safe_push (new_vec_stmt);
4926 else
4927 new_phi_result = PHI_RESULT (new_phis[0]);
4929 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4930 && reduc_fn != IFN_LAST)
4932 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4933 various data values where the condition matched and another vector
4934 (INDUCTION_INDEX) containing all the indexes of those matches. We
4935 need to extract the last matching index (which will be the index with
4936 highest value) and use this to index into the data vector.
4937 For the case where there were no matches, the data vector will contain
4938 all default values and the index vector will be all zeros. */
4940 /* Get various versions of the type of the vector of indexes. */
4941 tree index_vec_type = TREE_TYPE (induction_index);
4942 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4943 tree index_scalar_type = TREE_TYPE (index_vec_type);
4944 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4945 (index_vec_type);
4947 /* Get an unsigned integer version of the type of the data vector. */
4948 int scalar_precision
4949 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4950 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4951 tree vectype_unsigned = build_vector_type
4952 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4954 /* First we need to create a vector (ZERO_VEC) of zeros and another
4955 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4956 can create using a MAX reduction and then expanding.
4957 In the case where the loop never made any matches, the max index will
4958 be zero. */
4960 /* Vector of {0, 0, 0,...}. */
4961 tree zero_vec = make_ssa_name (vectype);
4962 tree zero_vec_rhs = build_zero_cst (vectype);
4963 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4964 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4966 /* Find maximum value from the vector of found indexes. */
4967 tree max_index = make_ssa_name (index_scalar_type);
4968 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4969 1, induction_index);
4970 gimple_call_set_lhs (max_index_stmt, max_index);
4971 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4973 /* Vector of {max_index, max_index, max_index,...}. */
4974 tree max_index_vec = make_ssa_name (index_vec_type);
4975 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4976 max_index);
4977 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4978 max_index_vec_rhs);
4979 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4981 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4982 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4983 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4984 otherwise. Only one value should match, resulting in a vector
4985 (VEC_COND) with one data value and the rest zeros.
4986 In the case where the loop never made any matches, every index will
4987 match, resulting in a vector with all data values (which will all be
4988 the default value). */
4990 /* Compare the max index vector to the vector of found indexes to find
4991 the position of the max value. */
4992 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4993 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4994 induction_index,
4995 max_index_vec);
4996 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4998 /* Use the compare to choose either values from the data vector or
4999 zero. */
5000 tree vec_cond = make_ssa_name (vectype);
5001 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5002 vec_compare, new_phi_result,
5003 zero_vec);
5004 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5006 /* Finally we need to extract the data value from the vector (VEC_COND)
5007 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5008 reduction, but because this doesn't exist, we can use a MAX reduction
5009 instead. The data value might be signed or a float so we need to cast
5010 it first.
5011 In the case where the loop never made any matches, the data values are
5012 all identical, and so will reduce down correctly. */
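  /* Illustrative example (editorial annotation): if VEC_COND is
     { 0, 0, 42, 0 } then the unsigned MAX reduction below yields 42, the
     single matched data value; if the loop never matched, every lane holds
     the same default value and the MAX is simply that value.  */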
5014 /* Make the matched data values unsigned. */
5015 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5016 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5017 vec_cond);
5018 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5019 VIEW_CONVERT_EXPR,
5020 vec_cond_cast_rhs);
5021 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5023 /* Reduce down to a scalar value. */
5024 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5025 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5026 1, vec_cond_cast);
5027 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5028 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5030 /* Convert the reduced value back to the result type and set as the
5031 result. */
5032 gimple_seq stmts = NULL;
5033 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5034 data_reduc);
5035 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5036 scalar_results.safe_push (new_temp);
5038 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5039 && reduc_fn == IFN_LAST)
5041 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5042 idx = 0;
5043 idx_val = induction_index[0];
5044 val = data_reduc[0];
5045 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5046 if (induction_index[i] > idx_val)
5047 val = data_reduc[i], idx_val = induction_index[i];
5048 return val; */
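  /* Illustrative example (editorial annotation): with
     induction_index = { 0, 7, 0, 3 } and data = { d0, d1, d2, d3 }, the
     scalar sequence generated below keeps d1, because 7 is the largest
     recorded index.  */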
5050 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5051 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5052 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5053 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5054 /* Enforced by vectorizable_reduction, which ensures we have target
5055 support before allowing a conditional reduction on variable-length
5056 vectors. */
5057 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5058 tree idx_val = NULL_TREE, val = NULL_TREE;
5059 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5061 tree old_idx_val = idx_val;
5062 tree old_val = val;
5063 idx_val = make_ssa_name (idx_eltype);
5064 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5065 build3 (BIT_FIELD_REF, idx_eltype,
5066 induction_index,
5067 bitsize_int (el_size),
5068 bitsize_int (off)));
5069 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5070 val = make_ssa_name (data_eltype);
5071 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5072 build3 (BIT_FIELD_REF,
5073 data_eltype,
5074 new_phi_result,
5075 bitsize_int (el_size),
5076 bitsize_int (off)));
5077 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5078 if (off != 0)
5080 tree new_idx_val = idx_val;
5081 tree new_val = val;
5082 if (off != v_size - el_size)
5084 new_idx_val = make_ssa_name (idx_eltype);
5085 epilog_stmt = gimple_build_assign (new_idx_val,
5086 MAX_EXPR, idx_val,
5087 old_idx_val);
5088 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5090 new_val = make_ssa_name (data_eltype);
5091 epilog_stmt = gimple_build_assign (new_val,
5092 COND_EXPR,
5093 build2 (GT_EXPR,
5094 boolean_type_node,
5095 idx_val,
5096 old_idx_val),
5097 val, old_val);
5098 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5099 idx_val = new_idx_val;
5100 val = new_val;
5103 /* Convert the reduced value back to the result type and set as the
5104 result. */
5105 gimple_seq stmts = NULL;
5106 val = gimple_convert (&stmts, scalar_type, val);
5107 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5108 scalar_results.safe_push (val);
5111 /* 2.3 Create the reduction code, using one of the three schemes described
5112 above. In SLP we simply need to extract all the elements from the
5113 vector (without reducing them), so we use scalar shifts. */
5114 else if (reduc_fn != IFN_LAST && !slp_reduc)
5116 tree tmp;
5117 tree vec_elem_type;
5119 /* Case 1: Create:
5120 v_out2 = reduc_expr <v_out1> */
5122 if (dump_enabled_p ())
5123 dump_printf_loc (MSG_NOTE, vect_location,
5124 "Reduce using direct vector reduction.\n");
5126 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5127 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5129 tree tmp_dest
5130 = vect_create_destination_var (scalar_dest, vec_elem_type);
5131 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5132 new_phi_result);
5133 gimple_set_lhs (epilog_stmt, tmp_dest);
5134 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5135 gimple_set_lhs (epilog_stmt, new_temp);
5136 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5138 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5139 new_temp);
5141 else
5143 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5144 new_phi_result);
5145 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5148 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5149 gimple_set_lhs (epilog_stmt, new_temp);
5150 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5152 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5153 == INTEGER_INDUC_COND_REDUCTION)
5154 && !operand_equal_p (initial_def, induc_val, 0))
5156 /* Earlier we set the initial value to be a vector of induc_val
5157 values. Check the result and if it is induc_val then replace
5158 with the original initial value, unless induc_val is
5159 the same as initial_def already. */
5160 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5161 induc_val);
5163 tmp = make_ssa_name (new_scalar_dest);
5164 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5165 initial_def, new_temp);
5166 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5167 new_temp = tmp;
5170 scalar_results.safe_push (new_temp);
5172 else if (direct_slp_reduc)
5174 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5175 with the elements for other SLP statements replaced with the
5176 neutral value. We can then do a normal reduction on each vector. */
5178 /* Enforced by vectorizable_reduction. */
5179 gcc_assert (new_phis.length () == 1);
5180 gcc_assert (pow2p_hwi (group_size));
5182 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5183 vec<stmt_vec_info> orig_phis
5184 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5185 gimple_seq seq = NULL;
5187 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5188 and the same element size as VECTYPE. */
5189 tree index = build_index_vector (vectype, 0, 1);
5190 tree index_type = TREE_TYPE (index);
5191 tree index_elt_type = TREE_TYPE (index_type);
5192 tree mask_type = build_same_sized_truth_vector_type (index_type);
5194 /* Create a vector that, for each element, identifies which of
5195 the REDUC_GROUP_SIZE results should use it. */
5196 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5197 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5198 build_vector_from_val (index_type, index_mask));
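      /* Illustrative example (editorial annotation): with GROUP_SIZE == 2
	 the mask is 1, so the index vector { 0, 1, 2, 3, ... } becomes
	 { 0, 1, 0, 1, ... }; even lanes then feed the first SLP result and
	 odd lanes the second.  */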
5200 /* Get a neutral vector value. This is simply a splat of the neutral
5201 scalar value if we have one, otherwise the initial scalar value
5202 is itself a neutral value. */
5203 tree vector_identity = NULL_TREE;
5204 if (neutral_op)
5205 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5206 neutral_op);
5207 for (unsigned int i = 0; i < group_size; ++i)
5209 /* If there's no universal neutral value, we can use the
5210 initial scalar value from the original PHI. This is used
5211 for MIN and MAX reduction, for example. */
5212 if (!neutral_op)
5214 tree scalar_value
5215 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5216 loop_preheader_edge (loop));
5217 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5218 scalar_value);
5221 /* Calculate the equivalent of:
5223 sel[j] = (index[j] == i);
5225 which selects the elements of NEW_PHI_RESULT that should
5226 be included in the result. */
5227 tree compare_val = build_int_cst (index_elt_type, i);
5228 compare_val = build_vector_from_val (index_type, compare_val);
5229 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5230 index, compare_val);
5232 /* Calculate the equivalent of:
5234 vec = sel ? new_phi_result : vector_identity;
5236 VEC is now suitable for a full vector reduction. */
5237 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5238 sel, new_phi_result, vector_identity);
5240 /* Do the reduction and convert it to the appropriate type. */
5241 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5242 TREE_TYPE (vectype), vec);
5243 scalar = gimple_convert (&seq, scalar_type, scalar);
5244 scalar_results.safe_push (scalar);
5246 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
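      /* Worked example (added for illustration): with REDUC_GROUP_SIZE == 2
         and NEW_PHI_RESULT = { a0, b0, a1, b1 }, the index vector is
         { 0, 1, 2, 3 } and masking it with group_size - 1 gives
         { 0, 1, 0, 1 }.  For i == 0 the select keeps { a0, id, a1, id } and
         the full-vector reduction gives the first scalar result; for i == 1
         it keeps { id, b0, id, b1 } and gives the second, where "id" is the
         neutral value (or the initial value of the corresponding PHI for
         MIN/MAX reductions).  */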
5248 else
5250 bool reduce_with_shift;
5251 tree vec_temp;
5253 /* COND reductions all do the final reduction with MAX_EXPR
5254 or MIN_EXPR. */
5255 if (code == COND_EXPR)
5257 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5258 == INTEGER_INDUC_COND_REDUCTION)
5259 code = induc_code;
5260 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5261 == CONST_COND_REDUCTION)
5262 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5263 else
5264 code = MAX_EXPR;
5267 /* See if the target wants to do the final (shift) reduction
5268 in a vector mode of smaller size and first reduce upper/lower
5269 halves against each other. */
5270 enum machine_mode mode1 = mode;
5271 tree vectype1 = vectype;
5272 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5273 unsigned sz1 = sz;
5274 if (!slp_reduc
5275 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5276 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5278 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5279 reduce_with_shift = have_whole_vector_shift (mode1);
5280 if (!VECTOR_MODE_P (mode1))
5281 reduce_with_shift = false;
5282 else
5284 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5285 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5286 reduce_with_shift = false;
5289 /* First reduce the vector to the desired vector size we should
5290 do shift reduction on by combining upper and lower halves. */
5291 new_temp = new_phi_result;
5292 while (sz > sz1)
5294 gcc_assert (!slp_reduc);
5295 sz /= 2;
5296 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5298 /* The target has to make sure we support lowpart/highpart
5299 extraction, either via direct vector extraction or through
5300 punning to an integer mode. */
5301 tree dst1, dst2;
5302 if (convert_optab_handler (vec_extract_optab,
5303 TYPE_MODE (TREE_TYPE (new_temp)),
5304 TYPE_MODE (vectype1))
5305 != CODE_FOR_nothing)
5307 /* Extract sub-vectors directly once vec_extract becomes
5308 a conversion optab. */
5309 dst1 = make_ssa_name (vectype1);
5310 epilog_stmt
5311 = gimple_build_assign (dst1, BIT_FIELD_REF,
5312 build3 (BIT_FIELD_REF, vectype1,
5313 new_temp, TYPE_SIZE (vectype1),
5314 bitsize_int (0)));
5315 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5316 dst2 = make_ssa_name (vectype1);
5317 epilog_stmt
5318 = gimple_build_assign (dst2, BIT_FIELD_REF,
5319 build3 (BIT_FIELD_REF, vectype1,
5320 new_temp, TYPE_SIZE (vectype1),
5321 bitsize_int (sz * BITS_PER_UNIT)));
5322 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5324 else
5326 /* Extract via punning to appropriately sized integer mode
5327 vector. */
5328 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5330 tree etype = build_vector_type (eltype, 2);
5331 gcc_assert (convert_optab_handler (vec_extract_optab,
5332 TYPE_MODE (etype),
5333 TYPE_MODE (eltype))
5334 != CODE_FOR_nothing);
5335 tree tem = make_ssa_name (etype);
5336 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5337 build1 (VIEW_CONVERT_EXPR,
5338 etype, new_temp));
5339 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5340 new_temp = tem;
5341 tem = make_ssa_name (eltype);
5342 epilog_stmt
5343 = gimple_build_assign (tem, BIT_FIELD_REF,
5344 build3 (BIT_FIELD_REF, eltype,
5345 new_temp, TYPE_SIZE (eltype),
5346 bitsize_int (0)));
5347 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5348 dst1 = make_ssa_name (vectype1);
5349 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5350 build1 (VIEW_CONVERT_EXPR,
5351 vectype1, tem));
5352 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5353 tem = make_ssa_name (eltype);
5354 epilog_stmt
5355 = gimple_build_assign (tem, BIT_FIELD_REF,
5356 build3 (BIT_FIELD_REF, eltype,
5357 new_temp, TYPE_SIZE (eltype),
5358 bitsize_int (sz * BITS_PER_UNIT)));
5359 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 dst2 = make_ssa_name (vectype1);
5361 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5362 build1 (VIEW_CONVERT_EXPR,
5363 vectype1, tem));
5364 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5367 new_temp = make_ssa_name (vectype1);
5368 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5369 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5372 if (reduce_with_shift && !slp_reduc)
5374 int element_bitsize = tree_to_uhwi (bitsize);
5375 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5376 for variable-length vectors and also requires direct target support
5377 for loop reductions. */
5378 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5379 int nelements = vec_size_in_bits / element_bitsize;
5380 vec_perm_builder sel;
5381 vec_perm_indices indices;
5383 int elt_offset;
5385 tree zero_vec = build_zero_cst (vectype1);
5386 /* Case 2: Create:
5387 for (offset = nelements/2; offset >= 1; offset/=2)
5389 Create: va' = vec_shift <va, offset>
5390 Create: va = vop <va, va'>
5391 } */
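      /* Worked example (added for illustration): for a four-element integer
         add reduction with va = { 1, 2, 3, 4 } the loop above performs

             offset = 2:  va' = { 3, 4, _, _ }   va = { 4, 6, _, _ }
             offset = 1:  va' = { 6, _, _, _ }   va = { 10, _, _, _ }

         after which element 0 of va holds the reduced value 10 and is
         extracted below; "_" marks lanes whose contents no longer matter.  */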
5393 tree rhs;
5395 if (dump_enabled_p ())
5396 dump_printf_loc (MSG_NOTE, vect_location,
5397 "Reduce using vector shifts\n");
5399 mode1 = TYPE_MODE (vectype1);
5400 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5401 for (elt_offset = nelements / 2;
5402 elt_offset >= 1;
5403 elt_offset /= 2)
5405 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5406 indices.new_vector (sel, 2, nelements);
5407 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5408 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5409 new_temp, zero_vec, mask);
5410 new_name = make_ssa_name (vec_dest, epilog_stmt);
5411 gimple_assign_set_lhs (epilog_stmt, new_name);
5412 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5414 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5415 new_temp);
5416 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5417 gimple_assign_set_lhs (epilog_stmt, new_temp);
5418 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5421 /* 2.4 Extract the final scalar result. Create:
5422 s_out3 = extract_field <v_out2, bitpos> */
5424 if (dump_enabled_p ())
5425 dump_printf_loc (MSG_NOTE, vect_location,
5426 "extract scalar result\n");
5428 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5429 bitsize, bitsize_zero_node);
5430 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5431 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5432 gimple_assign_set_lhs (epilog_stmt, new_temp);
5433 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5434 scalar_results.safe_push (new_temp);
5436 else
5438 /* Case 3: Create:
5439 s = extract_field <v_out2, 0>
5440 for (offset = element_size;
5441 offset < vector_size;
5442 offset += element_size;)
5444 Create: s' = extract_field <v_out2, offset>
5445 Create: s = op <s, s'> // For non SLP cases
5446 } */
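      /* Worked example (added for illustration): for a four-element vector
         v_out2 this expands to straight-line scalar code

             s = v_out2[0];
             s = op (s, v_out2[1]);
             s = op (s, v_out2[2]);
             s = op (s, v_out2[3]);

         except that in the SLP case each extracted element is pushed as its
         own scalar result instead of being combined.  */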
5448 if (dump_enabled_p ())
5449 dump_printf_loc (MSG_NOTE, vect_location,
5450 "Reduce using scalar code.\n");
5452 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5453 int element_bitsize = tree_to_uhwi (bitsize);
5454 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5456 int bit_offset;
5457 if (gimple_code (new_phi) == GIMPLE_PHI)
5458 vec_temp = PHI_RESULT (new_phi);
5459 else
5460 vec_temp = gimple_assign_lhs (new_phi);
5461 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5462 bitsize_zero_node);
5463 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5464 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5465 gimple_assign_set_lhs (epilog_stmt, new_temp);
5466 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5468 /* In SLP we don't need to apply the reduction operation, so we just
5469 collect the s' values in SCALAR_RESULTS. */
5470 if (slp_reduc)
5471 scalar_results.safe_push (new_temp);
5473 for (bit_offset = element_bitsize;
5474 bit_offset < vec_size_in_bits;
5475 bit_offset += element_bitsize)
5477 tree bitpos = bitsize_int (bit_offset);
5478 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5479 bitsize, bitpos);
5481 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5482 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5483 gimple_assign_set_lhs (epilog_stmt, new_name);
5484 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5486 if (slp_reduc)
5488 /* In SLP we don't need to apply the reduction operation, so
5489 we just collect the s' values in SCALAR_RESULTS. */
5490 new_temp = new_name;
5491 scalar_results.safe_push (new_name);
5493 else
5495 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5496 new_name, new_temp);
5497 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5498 gimple_assign_set_lhs (epilog_stmt, new_temp);
5499 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5504 /* The only case where we need to reduce scalar results in SLP is
5505 unrolling. If the size of SCALAR_RESULTS is greater than
5506 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5507 REDUC_GROUP_SIZE. */
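      /* Worked example (added for illustration): with REDUC_GROUP_SIZE == 2
         and SCALAR_RESULTS = { r0, r1, r2, r3 } (the group was effectively
         unrolled twice), the loop below folds r2 into r0 and r3 into r1,
         leaving the two final results in slots 0 and 1.  */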
5508 if (slp_reduc)
5510 tree res, first_res, new_res;
5511 gimple *new_stmt;
5513 /* Reduce multiple scalar results in case of SLP unrolling. */
5514 for (j = group_size; scalar_results.iterate (j, &res);
5515 j++)
5517 first_res = scalar_results[j % group_size];
5518 new_stmt = gimple_build_assign (new_scalar_dest, code,
5519 first_res, res);
5520 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5521 gimple_assign_set_lhs (new_stmt, new_res);
5522 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5523 scalar_results[j % group_size] = new_res;
5526 else
5527 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5528 scalar_results.safe_push (new_temp);
5531 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5532 == INTEGER_INDUC_COND_REDUCTION)
5533 && !operand_equal_p (initial_def, induc_val, 0))
5535 /* Earlier we set the initial value to be a vector of induc_val
5536 values. Check the result and if it is induc_val then replace
5537 it with the original initial value, unless induc_val is
5538 the same as initial_def already. */
5539 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5540 induc_val);
5542 tree tmp = make_ssa_name (new_scalar_dest);
5543 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5544 initial_def, new_temp);
5545 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5546 scalar_results[0] = tmp;
5550 vect_finalize_reduction:
5552 if (double_reduc)
5553 loop = loop->inner;
5555 /* 2.5 Adjust the final result by the initial value of the reduction
5556 variable. (When such adjustment is not needed, then
5557 'adjustment_def' is zero). For example, if code is PLUS we create:
5558 new_temp = loop_exit_def + adjustment_def */
5560 if (adjustment_def)
5562 gcc_assert (!slp_reduc);
5563 if (nested_in_vect_loop)
5565 new_phi = new_phis[0];
5566 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5567 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5568 new_dest = vect_create_destination_var (scalar_dest, vectype);
5570 else
5572 new_temp = scalar_results[0];
5573 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5574 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5575 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5578 epilog_stmt = gimple_build_assign (new_dest, expr);
5579 new_temp = make_ssa_name (new_dest, epilog_stmt);
5580 gimple_assign_set_lhs (epilog_stmt, new_temp);
5581 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5582 if (nested_in_vect_loop)
5584 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5585 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5586 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5588 if (!double_reduc)
5589 scalar_results.quick_push (new_temp);
5590 else
5591 scalar_results[0] = new_temp;
5593 else
5594 scalar_results[0] = new_temp;
5596 new_phis[0] = epilog_stmt;
5599 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5600 phis with new adjusted scalar results, i.e., replace use <s_out0>
5601 with use <s_out4>.
5603 Transform:
5604 loop_exit:
5605 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5606 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5607 v_out2 = reduce <v_out1>
5608 s_out3 = extract_field <v_out2, 0>
5609 s_out4 = adjust_result <s_out3>
5610 use <s_out0>
5611 use <s_out0>
5613 into:
5615 loop_exit:
5616 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5617 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5618 v_out2 = reduce <v_out1>
5619 s_out3 = extract_field <v_out2, 0>
5620 s_out4 = adjust_result <s_out3>
5621 use <s_out4>
5622 use <s_out4> */
5625 /* In an SLP reduction chain we reduce the vector results into one vector
5626 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5627 LHS of the last stmt in the reduction chain, since we are looking for
5628 the loop exit phi node. */
5629 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5631 stmt_vec_info dest_stmt_info
5632 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5633 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5634 group_size = 1;
5637 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5638 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5639 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5640 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5641 correspond to the first vector stmt, etc.
5642 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5643 if (group_size > new_phis.length ())
5645 ratio = group_size / new_phis.length ();
5646 gcc_assert (!(group_size % new_phis.length ()));
5648 else
5649 ratio = 1;
5651 stmt_vec_info epilog_stmt_info = NULL;
5652 for (k = 0; k < group_size; k++)
5654 if (k % ratio == 0)
5656 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5657 reduction_phi_info = reduction_phis[k / ratio];
5658 if (double_reduc)
5659 inner_phi = inner_phis[k / ratio];
5662 if (slp_reduc)
5664 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5666 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5667 /* SLP statements can't participate in patterns. */
5668 gcc_assert (!orig_stmt_info);
5669 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5672 phis.create (3);
5673 /* Find the loop-closed-use at the loop exit of the original scalar
5674 result. (The reduction result is expected to have two immediate uses -
5675 one at the latch block, and one at the loop exit). */
5676 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5677 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5678 && !is_gimple_debug (USE_STMT (use_p)))
5679 phis.safe_push (USE_STMT (use_p));
5681 /* While we expect to have found an exit_phi because of loop-closed-ssa
5682 form, we can end up without one if the scalar cycle is dead. */
5684 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5686 if (outer_loop)
5688 stmt_vec_info exit_phi_vinfo
5689 = loop_vinfo->lookup_stmt (exit_phi);
5690 gphi *vect_phi;
5692 if (double_reduc)
5693 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5694 else
5695 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5696 if (!double_reduc
5697 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5698 != vect_double_reduction_def)
5699 continue;
5701 /* Handle double reduction:
5703 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5704 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5705 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5706 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5708 At that point the regular reduction (stmt2 and stmt3) is
5709 already vectorized, as well as the exit phi node, stmt4.
5710 Here we vectorize the phi node of double reduction, stmt1, and
5711 update all relevant statements. */
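      /* A concrete source-level shape of such a double reduction (added for
         illustration) is a nested summation like

             int s = init;
             for (int i = 0; i < n1; ++i)
               for (int j = 0; j < n2; ++j)
                 s += a[i][j];

         where stmt1 is the outer-loop PHI for s, stmt2/stmt3 form the
         inner-loop reduction, and stmt4 is the PHI in the outer loop body
         that carries the inner-loop result back to stmt1.  */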
5713 /* Go through all the uses of s2 to find double reduction phi
5714 node, i.e., stmt1 above. */
5715 orig_name = PHI_RESULT (exit_phi);
5716 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5718 stmt_vec_info use_stmt_vinfo;
5719 tree vect_phi_init, preheader_arg, vect_phi_res;
5720 basic_block bb = gimple_bb (use_stmt);
5722 /* Check that USE_STMT is really double reduction phi
5723 node. */
5724 if (gimple_code (use_stmt) != GIMPLE_PHI
5725 || gimple_phi_num_args (use_stmt) != 2
5726 || bb->loop_father != outer_loop)
5727 continue;
5728 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5729 if (!use_stmt_vinfo
5730 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5731 != vect_double_reduction_def)
5732 continue;
5734 /* Create vector phi node for double reduction:
5735 vs1 = phi <vs0, vs2>
5736 vs1 was created previously in this function by a call to
5737 vect_get_vec_def_for_operand and is stored in
5738 vec_initial_def;
5739 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5740 vs0 is created here. */
5742 /* Create vector phi node. */
5743 vect_phi = create_phi_node (vec_initial_def, bb);
5744 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5746 /* Create vs0 - initial def of the double reduction phi. */
5747 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5748 loop_preheader_edge (outer_loop));
5749 vect_phi_init = get_initial_def_for_reduction
5750 (stmt_info, preheader_arg, NULL);
5752 /* Update phi node arguments with vs0 and vs2. */
5753 add_phi_arg (vect_phi, vect_phi_init,
5754 loop_preheader_edge (outer_loop),
5755 UNKNOWN_LOCATION);
5756 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5757 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5758 if (dump_enabled_p ())
5759 dump_printf_loc (MSG_NOTE, vect_location,
5760 "created double reduction phi node: %G",
5761 vect_phi);
5763 vect_phi_res = PHI_RESULT (vect_phi);
5765 /* Replace the use, i.e., set the correct vs1 in the regular
5766 reduction phi node. FORNOW, NCOPIES is always 1, so the
5767 loop is redundant. */
5768 stmt_vec_info use_info = reduction_phi_info;
5769 for (j = 0; j < ncopies; j++)
5771 edge pr_edge = loop_preheader_edge (loop);
5772 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5773 pr_edge->dest_idx, vect_phi_res);
5774 use_info = STMT_VINFO_RELATED_STMT (use_info);
5780 phis.release ();
5781 if (nested_in_vect_loop)
5783 if (double_reduc)
5784 loop = outer_loop;
5785 else
5786 continue;
5789 phis.create (3);
5790 /* Find the loop-closed-use at the loop exit of the original scalar
5791 result. (The reduction result is expected to have two immediate uses,
5792 one at the latch block, and one at the loop exit). For double
5793 reductions we are looking for exit phis of the outer loop. */
5794 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5796 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5798 if (!is_gimple_debug (USE_STMT (use_p)))
5799 phis.safe_push (USE_STMT (use_p));
5801 else
5803 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5805 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5807 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5809 if (!flow_bb_inside_loop_p (loop,
5810 gimple_bb (USE_STMT (phi_use_p)))
5811 && !is_gimple_debug (USE_STMT (phi_use_p)))
5812 phis.safe_push (USE_STMT (phi_use_p));
5818 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5820 /* Replace the uses: */
5821 orig_name = PHI_RESULT (exit_phi);
5822 scalar_result = scalar_results[k];
5823 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5824 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5825 SET_USE (use_p, scalar_result);
5828 phis.release ();
5832 /* Return a vector of type VECTYPE that is equal to the vector select
5833 operation "MASK ? VEC : IDENTITY". Insert the select statements
5834 before GSI. */
5836 static tree
5837 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5838 tree vec, tree identity)
5840 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5841 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5842 mask, vec, identity);
5843 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5844 return cond;
5847 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5848 order, starting with LHS. Insert the extraction statements before GSI and
5849 associate the new scalar SSA names with variable SCALAR_DEST.
5850 Return the SSA name for the result. */
5852 static tree
5853 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5854 tree_code code, tree lhs, tree vector_rhs)
5856 tree vectype = TREE_TYPE (vector_rhs);
5857 tree scalar_type = TREE_TYPE (vectype);
5858 tree bitsize = TYPE_SIZE (scalar_type);
5859 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5860 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5862 for (unsigned HOST_WIDE_INT bit_offset = 0;
5863 bit_offset < vec_size_in_bits;
5864 bit_offset += element_bitsize)
5866 tree bitpos = bitsize_int (bit_offset);
5867 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5868 bitsize, bitpos);
5870 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5871 rhs = make_ssa_name (scalar_dest, stmt);
5872 gimple_assign_set_lhs (stmt, rhs);
5873 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5875 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5876 tree new_name = make_ssa_name (scalar_dest, stmt);
5877 gimple_assign_set_lhs (stmt, new_name);
5878 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5879 lhs = new_name;
5881 return lhs;
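/* Added illustration (not part of GCC): a scalar model of the open-coded
   expansion above for an in-order FP add reduction.  The elements are
   combined strictly left to right, so the result matches the original
   scalar loop exactly, which is the point of FOLD_LEFT_REDUCTION.  */

static double
fold_left_plus_model (double lhs, const double *elts, unsigned int n)
{
  for (unsigned int i = 0; i < n; ++i)
    /* Mirrors: s' = extract_field <vector, i>; s = s + s'.  */
    lhs = lhs + elts[i];
  return lhs;
}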
5884 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5885 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5886 statement. CODE is the operation performed by STMT_INFO and OPS are
5887 its scalar operands. REDUC_INDEX is the index of the operand in
5888 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5889 implements in-order reduction, or IFN_LAST if we should open-code it.
5890 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5891 that should be used to control the operation in a fully-masked loop. */
5893 static bool
5894 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5895 gimple_stmt_iterator *gsi,
5896 stmt_vec_info *vec_stmt, slp_tree slp_node,
5897 gimple *reduc_def_stmt,
5898 tree_code code, internal_fn reduc_fn,
5899 tree ops[3], tree vectype_in,
5900 int reduc_index, vec_loop_masks *masks)
5902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5903 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5904 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5905 stmt_vec_info new_stmt_info = NULL;
5907 int ncopies;
5908 if (slp_node)
5909 ncopies = 1;
5910 else
5911 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5913 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5914 gcc_assert (ncopies == 1);
5915 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5916 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5917 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5918 == FOLD_LEFT_REDUCTION);
5920 if (slp_node)
5921 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5922 TYPE_VECTOR_SUBPARTS (vectype_in)));
5924 tree op0 = ops[1 - reduc_index];
5926 int group_size = 1;
5927 stmt_vec_info scalar_dest_def_info;
5928 auto_vec<tree> vec_oprnds0;
5929 if (slp_node)
5931 auto_vec<vec<tree> > vec_defs (2);
5932 auto_vec<tree> sops(2);
5933 sops.quick_push (ops[0]);
5934 sops.quick_push (ops[1]);
5935 vect_get_slp_defs (sops, slp_node, &vec_defs);
5936 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5937 vec_defs[0].release ();
5938 vec_defs[1].release ();
5939 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5940 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5942 else
5944 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5945 vec_oprnds0.create (1);
5946 vec_oprnds0.quick_push (loop_vec_def0);
5947 scalar_dest_def_info = stmt_info;
5950 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5951 tree scalar_type = TREE_TYPE (scalar_dest);
5952 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5954 int vec_num = vec_oprnds0.length ();
5955 gcc_assert (vec_num == 1 || slp_node);
5956 tree vec_elem_type = TREE_TYPE (vectype_out);
5957 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5959 tree vector_identity = NULL_TREE;
5960 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5961 vector_identity = build_zero_cst (vectype_out);
5963 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5964 int i;
5965 tree def0;
5966 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5968 gimple *new_stmt;
5969 tree mask = NULL_TREE;
5970 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5971 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5973 /* Handle MINUS by adding the negative. */
5974 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5976 tree negated = make_ssa_name (vectype_out);
5977 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5978 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5979 def0 = negated;
5982 if (mask)
5983 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5984 vector_identity);
5986 /* On the first iteration the input is simply the scalar phi
5987 result, and for subsequent iterations it is the output of
5988 the preceding operation. */
5989 if (reduc_fn != IFN_LAST)
5991 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5992 /* For chained SLP reductions the output of the previous reduction
5993 operation serves as the input of the next. For the final statement
5994 the output cannot be a temporary - we reuse the original
5995 scalar destination of the last statement. */
5996 if (i != vec_num - 1)
5998 gimple_set_lhs (new_stmt, scalar_dest_var);
5999 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6000 gimple_set_lhs (new_stmt, reduc_var);
6003 else
6005 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6006 reduc_var, def0);
6007 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6008 /* Remove the statement, so that we can use the same code paths
6009 as for statements that we've just created. */
6010 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6011 gsi_remove (&tmp_gsi, true);
6014 if (i == vec_num - 1)
6016 gimple_set_lhs (new_stmt, scalar_dest);
6017 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6018 new_stmt);
6020 else
6021 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
6022 new_stmt, gsi);
6024 if (slp_node)
6025 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6028 if (!slp_node)
6029 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6031 return true;
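/* Added illustration (not part of GCC): in a fully-masked loop the code
   above first replaces inactive lanes with the identity value (zero for an
   add) via merge_with_identity and then folds the whole vector into the
   running scalar in order.  A scalar model of one such step:  */

static double
masked_fold_left_step_model (double reduc_var, const double *lane,
                             const _Bool *mask, unsigned int nlanes)
{
  for (unsigned int i = 0; i < nlanes; ++i)
    {
      /* mask ? lane : identity, i.e. the VEC_COND_EXPR built by
         merge_with_identity; the identity for PLUS is 0.  */
      double elt = mask[i] ? lane[i] : 0.0;
      reduc_var = reduc_var + elt;
    }
  return reduc_var;
}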
6034 /* Function is_nonwrapping_integer_induction.
6036 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6037 both increments and does not cause overflow. */
6039 static bool
6040 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
6042 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6043 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6044 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6045 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6046 widest_int ni, max_loop_value, lhs_max;
6047 wi::overflow_type overflow = wi::OVF_NONE;
6049 /* Make sure the loop is integer based. */
6050 if (TREE_CODE (base) != INTEGER_CST
6051 || TREE_CODE (step) != INTEGER_CST)
6052 return false;
6054 /* Check that the max size of the loop will not wrap. */
6056 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6057 return true;
6059 if (! max_stmt_executions (loop, &ni))
6060 return false;
6062 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6063 &overflow);
6064 if (overflow)
6065 return false;
6067 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6068 TYPE_SIGN (lhs_type), &overflow);
6069 if (overflow)
6070 return false;
6072 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6073 <= TYPE_PRECISION (lhs_type));
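/* Added illustration (not part of GCC): the check above is essentially the
   following computation carried out in wide arithmetic, sketched here with
   __int128 for a signed induction of TYPE_BITS bits (TYPE_BITS <= 64
   assumed); the real code also returns true early when signed overflow is
   undefined for the type.  */

static _Bool
induction_stays_in_range_model (long long base, long long step,
                                unsigned long long max_niters, int type_bits)
{
  /* Final value reached by the IV after MAX_NITERS steps.  */
  __int128 last = (__int128) base + (__int128) step * (__int128) max_niters;
  __int128 lo = -((__int128) 1 << (type_bits - 1));
  __int128 hi = ((__int128) 1 << (type_bits - 1)) - 1;
  return last >= lo && last <= hi;
}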
6076 /* Check if masking can be supported by inserting a conditional expression.
6077 CODE is the code for the operation. COND_FN is the conditional internal
6078 function, if it exists. VECTYPE_IN is the type of the vector input. */
6079 static bool
6080 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6081 tree vectype_in)
6083 if (cond_fn != IFN_LAST
6084 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6085 OPTIMIZE_FOR_SPEED))
6086 return false;
6088 switch (code)
6090 case DOT_PROD_EXPR:
6091 case SAD_EXPR:
6092 return true;
6094 default:
6095 return false;
6099 /* Insert a conditional expression to enable masked vectorization. CODE is the
6100 code for the operation. VOP is the array of operands. MASK is the loop
6101 mask. GSI is a statement iterator used to place the new conditional
6102 expression. */
6103 static void
6104 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6105 gimple_stmt_iterator *gsi)
6107 switch (code)
6109 case DOT_PROD_EXPR:
6111 tree vectype = TREE_TYPE (vop[1]);
6112 tree zero = build_zero_cst (vectype);
6113 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6114 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6115 mask, vop[1], zero);
6116 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6117 vop[1] = masked_op1;
6118 break;
6121 case SAD_EXPR:
6123 tree vectype = TREE_TYPE (vop[1]);
6124 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6125 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6126 mask, vop[1], vop[0]);
6127 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6128 vop[1] = masked_op1;
6129 break;
6132 default:
6133 gcc_unreachable ();
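/* Added illustration (not part of GCC): a scalar model of the DOT_PROD_EXPR
   case above.  Forcing one multiplication operand to zero in inactive lanes
   makes those lanes contribute nothing to the accumulator, which is why no
   conditional internal function is required.  */

static int
masked_dot_prod_model (const short *a, const short *b, const _Bool *mask,
                       unsigned int n, int acc)
{
  for (unsigned int i = 0; i < n; ++i)
    {
      /* VEC_COND_EXPR <mask, b, 0>, as built by build_vect_cond_expr.  */
      short b_masked = mask[i] ? b[i] : 0;
      acc += a[i] * b_masked;
    }
  return acc;
}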
6137 /* Function vectorizable_reduction.
6139 Check if STMT_INFO performs a reduction operation that can be vectorized.
6140 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6141 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6142 Return true if STMT_INFO is vectorizable in this way.
6144 This function also handles reduction idioms (patterns) that have been
6145 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6146 may be of this form:
6147 X = pattern_expr (arg0, arg1, ..., X)
6148 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6149 sequence that had been detected and replaced by the pattern-stmt
6150 (STMT_INFO).
6152 This function also handles reduction of condition expressions, for example:
6153 for (int i = 0; i < N; i++)
6154 if (a[i] < value)
6155 last = a[i];
6156 This is handled by vectorising the loop and creating an additional vector
6157 containing the loop indexes for which "a[i] < value" was true. In the
6158 function epilogue this is reduced to a single max value and then used to
6159 index into the vector of results.
6161 In some cases of reduction patterns, the type of the reduction variable X is
6162 different than the type of the other arguments of STMT_INFO.
6163 In such cases, the vectype that is used when transforming STMT_INFO into
6164 a vector stmt is different than the vectype that is used to determine the
6165 vectorization factor, because it consists of a different number of elements
6166 than the actual number of elements that are being operated upon in parallel.
6168 For example, consider an accumulation of shorts into an int accumulator.
6169 On some targets it's possible to vectorize this pattern operating on 8
6170 shorts at a time (hence, the vectype for purposes of determining the
6171 vectorization factor should be V8HI); on the other hand, the vectype that
6172 is used to create the vector form is actually V4SI (the type of the result).
6174 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6175 indicates what is the actual level of parallelism (V8HI in the example), so
6176 that the right vectorization factor would be derived. This vectype
6177 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6178 be used to create the vectorized stmt. The right vectype for the vectorized
6179 stmt is obtained from the type of the result X:
6180 get_vectype_for_scalar_type (TREE_TYPE (X))
6182 This means that, contrary to "regular" reductions (or "regular" stmts in
6183 general), the following equation:
6184 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6185 does *NOT* necessarily hold for reduction patterns. */
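/* Added illustration (not part of GCC): the shorts-into-int case described
   above corresponds to source code such as

       short a[N];
       int sum = 0;
       for (int i = 0; i < N; ++i)
         sum += a[i];

   Eight shorts are consumed per vector iteration (so the vectorization
   factor is derived from V8HI), while the accumulation statement itself
   produces a V4SI result, which is why STMT_VINFO_VECTYPE must not be used
   to build the vector stmt here.  */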
6187 bool
6188 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6189 stmt_vec_info *vec_stmt, slp_tree slp_node,
6190 slp_instance slp_node_instance,
6191 stmt_vector_for_cost *cost_vec)
6193 tree vec_dest;
6194 tree scalar_dest;
6195 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6196 tree vectype_in = NULL_TREE;
6197 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6198 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6199 enum tree_code code, orig_code;
6200 internal_fn reduc_fn;
6201 machine_mode vec_mode;
6202 int op_type;
6203 optab optab;
6204 tree new_temp = NULL_TREE;
6205 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6206 stmt_vec_info cond_stmt_vinfo = NULL;
6207 enum tree_code cond_reduc_op_code = ERROR_MARK;
6208 tree scalar_type;
6209 bool is_simple_use;
6210 int i;
6211 int ncopies;
6212 int epilog_copies;
6213 stmt_vec_info prev_stmt_info, prev_phi_info;
6214 bool single_defuse_cycle = false;
6215 stmt_vec_info new_stmt_info = NULL;
6216 int j;
6217 tree ops[3];
6218 enum vect_def_type dts[3];
6219 bool nested_cycle = false, found_nested_cycle_def = false;
6220 bool double_reduc = false;
6221 basic_block def_bb;
6222 struct loop * def_stmt_loop;
6223 tree def_arg;
6224 auto_vec<tree> vec_oprnds0;
6225 auto_vec<tree> vec_oprnds1;
6226 auto_vec<tree> vec_oprnds2;
6227 auto_vec<tree> vect_defs;
6228 auto_vec<stmt_vec_info> phis;
6229 int vec_num;
6230 tree def0, tem;
6231 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6232 tree cond_reduc_val = NULL_TREE;
6234 /* Make sure it was already recognized as a reduction computation. */
6235 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6236 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6237 return false;
6239 if (nested_in_vect_loop_p (loop, stmt_info))
6241 loop = loop->inner;
6242 nested_cycle = true;
6245 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6246 gcc_assert (slp_node
6247 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6249 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6251 tree phi_result = gimple_phi_result (phi);
6252 /* Analysis is fully done on the reduction stmt invocation. */
6253 if (! vec_stmt)
6255 if (slp_node)
6256 slp_node_instance->reduc_phis = slp_node;
6258 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6259 return true;
6262 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6263 /* Leave the scalar phi in place. Note that checking
6264 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6265 for reductions involving a single statement. */
6266 return true;
6268 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6269 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6271 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6272 == EXTRACT_LAST_REDUCTION)
6273 /* Leave the scalar phi in place. */
6274 return true;
6276 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6277 code = gimple_assign_rhs_code (reduc_stmt);
6278 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6280 tree op = gimple_op (reduc_stmt, k);
6281 if (op == phi_result)
6282 continue;
6283 if (k == 1 && code == COND_EXPR)
6284 continue;
6285 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6286 gcc_assert (is_simple_use);
6287 if (dt == vect_constant_def || dt == vect_external_def)
6288 continue;
6289 if (!vectype_in
6290 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6291 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6292 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6293 break;
6295 /* For a nested cycle we might end up with an operation like
6296 phi_result * phi_result. */
6297 if (!vectype_in)
6298 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6299 gcc_assert (vectype_in);
6301 if (slp_node)
6302 ncopies = 1;
6303 else
6304 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6306 stmt_vec_info use_stmt_info;
6307 if (ncopies > 1
6308 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6309 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6310 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6311 single_defuse_cycle = true;
6313 /* Create the destination vector */
6314 scalar_dest = gimple_assign_lhs (reduc_stmt);
6315 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6317 if (slp_node)
6318 /* The size vect_schedule_slp_instance computes is off for us. */
6319 vec_num = vect_get_num_vectors
6320 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6321 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6322 vectype_in);
6323 else
6324 vec_num = 1;
6326 /* Generate the reduction PHIs upfront. */
6327 prev_phi_info = NULL;
6328 for (j = 0; j < ncopies; j++)
6330 if (j == 0 || !single_defuse_cycle)
6332 for (i = 0; i < vec_num; i++)
6334 /* Create the reduction-phi that defines the reduction
6335 operand. */
6336 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6337 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6339 if (slp_node)
6340 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6341 else
6343 if (j == 0)
6344 STMT_VINFO_VEC_STMT (stmt_info)
6345 = *vec_stmt = new_phi_info;
6346 else
6347 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6348 prev_phi_info = new_phi_info;
6354 return true;
6357 /* 1. Is vectorizable reduction? */
6358 /* Not supportable if the reduction variable is used in the loop, unless
6359 it's a reduction chain. */
6360 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6361 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6362 return false;
6364 /* Reductions that are not used even in an enclosing outer-loop
6365 are expected to be "live" (used out of the loop). */
6366 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6367 && !STMT_VINFO_LIVE_P (stmt_info))
6368 return false;
6370 /* 2. Has this been recognized as a reduction pattern?
6372 Check if STMT represents a pattern that has been recognized
6373 in earlier analysis stages. For stmts that represent a pattern,
6374 the STMT_VINFO_RELATED_STMT field records the last stmt in
6375 the original sequence that constitutes the pattern. */
6377 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6378 if (orig_stmt_info)
6380 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6381 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6384 /* 3. Check the operands of the operation. The first operands are defined
6385 inside the loop body. The last operand is the reduction variable,
6386 which is defined by the loop-header-phi. */
6388 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6390 /* Flatten RHS. */
6391 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6393 case GIMPLE_BINARY_RHS:
6394 code = gimple_assign_rhs_code (stmt);
6395 op_type = TREE_CODE_LENGTH (code);
6396 gcc_assert (op_type == binary_op);
6397 ops[0] = gimple_assign_rhs1 (stmt);
6398 ops[1] = gimple_assign_rhs2 (stmt);
6399 break;
6401 case GIMPLE_TERNARY_RHS:
6402 code = gimple_assign_rhs_code (stmt);
6403 op_type = TREE_CODE_LENGTH (code);
6404 gcc_assert (op_type == ternary_op);
6405 ops[0] = gimple_assign_rhs1 (stmt);
6406 ops[1] = gimple_assign_rhs2 (stmt);
6407 ops[2] = gimple_assign_rhs3 (stmt);
6408 break;
6410 case GIMPLE_UNARY_RHS:
6411 return false;
6413 default:
6414 gcc_unreachable ();
6417 if (code == COND_EXPR && slp_node)
6418 return false;
6420 scalar_dest = gimple_assign_lhs (stmt);
6421 scalar_type = TREE_TYPE (scalar_dest);
6422 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6423 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6424 return false;
6426 /* Do not try to vectorize bit-precision reductions. */
6427 if (!type_has_mode_precision_p (scalar_type))
6428 return false;
6430 /* All uses but the last are expected to be defined in the loop.
6431 The last use is the reduction variable. In case of nested cycle this
6432 assumption is not true: we use reduc_index to record the index of the
6433 reduction variable. */
6434 stmt_vec_info reduc_def_info;
6435 if (orig_stmt_info)
6436 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6437 else
6438 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6439 gcc_assert (reduc_def_info);
6440 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6441 tree reduc_def = PHI_RESULT (reduc_def_phi);
6442 int reduc_index = -1;
6443 for (i = 0; i < op_type; i++)
6445 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6446 if (i == 0 && code == COND_EXPR)
6447 continue;
6449 stmt_vec_info def_stmt_info;
6450 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6451 &def_stmt_info);
6452 dt = dts[i];
6453 gcc_assert (is_simple_use);
6454 if (dt == vect_reduction_def
6455 && ops[i] == reduc_def)
6457 reduc_index = i;
6458 continue;
6460 else if (tem)
6462 /* To properly compute ncopies we are interested in the widest
6463 input type in case we're looking at a widening accumulation. */
6464 if (!vectype_in
6465 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6466 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6467 vectype_in = tem;
6470 if (dt != vect_internal_def
6471 && dt != vect_external_def
6472 && dt != vect_constant_def
6473 && dt != vect_induction_def
6474 && !(dt == vect_nested_cycle && nested_cycle))
6475 return false;
6477 if (dt == vect_nested_cycle
6478 && ops[i] == reduc_def)
6480 found_nested_cycle_def = true;
6481 reduc_index = i;
6484 if (i == 1 && code == COND_EXPR)
6486 /* Record how value of COND_EXPR is defined. */
6487 if (dt == vect_constant_def)
6489 cond_reduc_dt = dt;
6490 cond_reduc_val = ops[i];
6492 if (dt == vect_induction_def
6493 && def_stmt_info
6494 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6496 cond_reduc_dt = dt;
6497 cond_stmt_vinfo = def_stmt_info;
6502 if (!vectype_in)
6503 vectype_in = vectype_out;
6505 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6506 directly used in stmt. */
6507 if (reduc_index == -1)
6509 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 "in-order reduction chain without SLP.\n");
6514 return false;
6518 if (!(reduc_index == -1
6519 || dts[reduc_index] == vect_reduction_def
6520 || dts[reduc_index] == vect_nested_cycle
6521 || ((dts[reduc_index] == vect_internal_def
6522 || dts[reduc_index] == vect_external_def
6523 || dts[reduc_index] == vect_constant_def
6524 || dts[reduc_index] == vect_induction_def)
6525 && nested_cycle && found_nested_cycle_def)))
6527 /* For pattern recognized stmts, orig_stmt might be a reduction,
6528 but some helper statements for the pattern might not, or
6529 might be COND_EXPRs with reduction uses in the condition. */
6530 gcc_assert (orig_stmt_info);
6531 return false;
6534 /* PHIs should not participate in patterns. */
6535 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6536 enum vect_reduction_type v_reduc_type
6537 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6538 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6540 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6541 /* If we have a condition reduction, see if we can simplify it further. */
6542 if (v_reduc_type == COND_REDUCTION)
6544 /* TODO: We can't yet handle reduction chains, since we need to treat
6545 each COND_EXPR in the chain specially, not just the last one.
6546 E.g. for:
6548 x_1 = PHI <x_3, ...>
6549 x_2 = a_2 ? ... : x_1;
6550 x_3 = a_3 ? ... : x_2;
6552 we're interested in the last element in x_3 for which a_2 || a_3
6553 is true, whereas the current reduction chain handling would
6554 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6555 as a reduction operation. */
6556 if (reduc_index == -1)
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6560 "conditional reduction chains not supported\n");
6561 return false;
6564 /* vect_is_simple_reduction ensured that operand 2 is the
6565 loop-carried operand. */
6566 gcc_assert (reduc_index == 2);
6568 /* Loop peeling modifies the initial value of the reduction PHI, which
6569 makes the reduction stmt to be transformed differ from the original
6570 stmt analyzed. We need to record the reduction code for a
6571 CONST_COND_REDUCTION type reduction at the analysis stage so that
6572 it can be used directly at the transform stage. */
6573 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6574 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6576 /* Also set the reduction type to CONST_COND_REDUCTION. */
6577 gcc_assert (cond_reduc_dt == vect_constant_def);
6578 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6580 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6581 vectype_in, OPTIMIZE_FOR_SPEED))
6583 if (dump_enabled_p ())
6584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6585 "optimizing condition reduction with"
6586 " FOLD_EXTRACT_LAST.\n");
6587 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6589 else if (cond_reduc_dt == vect_induction_def)
6591 tree base
6592 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6593 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6595 gcc_assert (TREE_CODE (base) == INTEGER_CST
6596 && TREE_CODE (step) == INTEGER_CST);
6597 cond_reduc_val = NULL_TREE;
6598 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6599 above base; punt if base is the minimum value of the type for
6600 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6601 if (tree_int_cst_sgn (step) == -1)
6603 cond_reduc_op_code = MIN_EXPR;
6604 if (tree_int_cst_sgn (base) == -1)
6605 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6606 else if (tree_int_cst_lt (base,
6607 TYPE_MAX_VALUE (TREE_TYPE (base))))
6608 cond_reduc_val
6609 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6611 else
6613 cond_reduc_op_code = MAX_EXPR;
6614 if (tree_int_cst_sgn (base) == 1)
6615 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6616 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6617 base))
6618 cond_reduc_val
6619 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6621 if (cond_reduc_val)
6623 if (dump_enabled_p ())
6624 dump_printf_loc (MSG_NOTE, vect_location,
6625 "condition expression based on "
6626 "integer induction.\n");
6627 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6628 = INTEGER_INDUC_COND_REDUCTION;
6631 else if (cond_reduc_dt == vect_constant_def)
6633 enum vect_def_type cond_initial_dt;
6634 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6635 tree cond_initial_val
6636 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6638 gcc_assert (cond_reduc_val != NULL_TREE);
6639 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6640 if (cond_initial_dt == vect_constant_def
6641 && types_compatible_p (TREE_TYPE (cond_initial_val),
6642 TREE_TYPE (cond_reduc_val)))
6644 tree e = fold_binary (LE_EXPR, boolean_type_node,
6645 cond_initial_val, cond_reduc_val);
6646 if (e && (integer_onep (e) || integer_zerop (e)))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_NOTE, vect_location,
6650 "condition expression based on "
6651 "compile time constant.\n");
6652 /* Record reduction code at analysis stage. */
6653 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6654 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6655 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6656 = CONST_COND_REDUCTION;
6662 if (orig_stmt_info)
6663 gcc_assert (tmp == orig_stmt_info
6664 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6665 else
6666 /* We changed STMT to be the first stmt in the reduction chain, hence we
6667 check that in this case the first element in the chain is STMT. */
6668 gcc_assert (tmp == stmt_info
6669 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6671 if (STMT_VINFO_LIVE_P (reduc_def_info))
6672 return false;
6674 if (slp_node)
6675 ncopies = 1;
6676 else
6677 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6679 gcc_assert (ncopies >= 1);
6681 vec_mode = TYPE_MODE (vectype_in);
6682 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6684 if (nested_cycle)
6686 def_bb = gimple_bb (reduc_def_phi);
6687 def_stmt_loop = def_bb->loop_father;
6688 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6689 loop_preheader_edge (def_stmt_loop));
6690 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6691 if (def_arg_stmt_info
6692 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6693 == vect_double_reduction_def))
6694 double_reduc = true;
6697 vect_reduction_type reduction_type
6698 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6699 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6700 && ncopies > 1)
6702 if (dump_enabled_p ())
6703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6704 "multiple types in double reduction or condition "
6705 "reduction.\n");
6706 return false;
6709 if (code == COND_EXPR)
6711 /* Only call during the analysis stage, otherwise we'll lose
6712 STMT_VINFO_TYPE. */
6713 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6714 true, NULL, cost_vec))
6716 if (dump_enabled_p ())
6717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6718 "unsupported condition in reduction\n");
6719 return false;
6722 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6723 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6725 /* Only call during the analysis stage, otherwise we'll lose
6726 STMT_VINFO_TYPE. We only support this for nested cycles
6727 without double reductions at the moment. */
6728 if (!nested_cycle
6729 || double_reduc
6730 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6731 NULL, cost_vec)))
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6735 "unsupported shift or rotation in reduction\n");
6736 return false;
6739 else
6741 /* 4. Supportable by target? */
6743 /* 4.1. check support for the operation in the loop */
6744 optab = optab_for_tree_code (code, vectype_in, optab_default);
6745 if (!optab)
6747 if (dump_enabled_p ())
6748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6749 "no optab.\n");
6751 return false;
6754 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6756 if (dump_enabled_p ())
6757 dump_printf (MSG_NOTE, "op not supported by target.\n");
6759 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6760 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6761 return false;
6763 if (dump_enabled_p ())
6764 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6767 /* Worthwhile without SIMD support? */
6768 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6769 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6771 if (dump_enabled_p ())
6772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6773 "not worthwhile without SIMD support.\n");
6775 return false;
6779 /* 4.2. Check support for the epilog operation.
6781 If STMT represents a reduction pattern, then the type of the
6782 reduction variable may be different than the type of the rest
6783 of the arguments. For example, consider the case of accumulation
6784 of shorts into an int accumulator; The original code:
6785 S1: int_a = (int) short_a;
6786 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6788 was replaced with:
6789 STMT: int_acc = widen_sum <short_a, int_acc>
6791 This means that:
6792 1. The tree-code that is used to create the vector operation in the
6793 epilog code (that reduces the partial results) is not the
6794 tree-code of STMT, but is rather the tree-code of the original
6795 stmt from the pattern that STMT is replacing. I.e, in the example
6796 above we want to use 'widen_sum' in the loop, but 'plus' in the
6797 epilog.
6798 2. The type (mode) we use to check available target support
6799 for the vector operation to be created in the *epilog*, is
6800 determined by the type of the reduction variable (in the example
6801 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6802 However the type (mode) we use to check available target support
6803 for the vector operation to be created *inside the loop*, is
6804 determined by the type of the other arguments to STMT (in the
6805 example we'd check this: optab_handler (widen_sum_optab,
6806 vect_short_mode)).
6808 This is contrary to "regular" reductions, in which the types of all
6809 the arguments are the same as the type of the reduction variable.
6810 For "regular" reductions we can therefore use the same vector type
6811 (and also the same tree-code) when generating the epilog code and
6812 when generating the code inside the loop. */
6814 if (orig_stmt_info
6815 && (reduction_type == TREE_CODE_REDUCTION
6816 || reduction_type == FOLD_LEFT_REDUCTION))
6818 /* This is a reduction pattern: get the vectype from the type of the
6819 reduction variable, and get the tree-code from orig_stmt. */
6820 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6821 gcc_assert (vectype_out);
6822 vec_mode = TYPE_MODE (vectype_out);
6824 else
6826 /* Regular reduction: the same vectype and tree-code as used for
6827 the vector code inside the loop can be used for the epilog code. */
6828 orig_code = code;
6830 if (code == MINUS_EXPR)
6831 orig_code = PLUS_EXPR;
6833 /* For simple condition reductions, replace with the actual expression
6834 we want to base our reduction around. */
6835 if (reduction_type == CONST_COND_REDUCTION)
6837 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6838 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6840 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6841 orig_code = cond_reduc_op_code;
6844 reduc_fn = IFN_LAST;
6846 if (reduction_type == TREE_CODE_REDUCTION
6847 || reduction_type == FOLD_LEFT_REDUCTION
6848 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6849 || reduction_type == CONST_COND_REDUCTION)
6851 if (reduction_type == FOLD_LEFT_REDUCTION
6852 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6853 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6855 if (reduc_fn != IFN_LAST
6856 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6857 OPTIMIZE_FOR_SPEED))
6859 if (dump_enabled_p ())
6860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6861 "reduc op not supported by target.\n");
6863 reduc_fn = IFN_LAST;
6866 else
6868 if (!nested_cycle || double_reduc)
6870 if (dump_enabled_p ())
6871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6872 "no reduc code for scalar code.\n");
6874 return false;
6878 else if (reduction_type == COND_REDUCTION)
6880 int scalar_precision
6881 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6882 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6883 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6884 nunits_out);
6886 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6887 OPTIMIZE_FOR_SPEED))
6888 reduc_fn = IFN_REDUC_MAX;
6891 if (reduction_type != EXTRACT_LAST_REDUCTION
6892 && (!nested_cycle || double_reduc)
6893 && reduc_fn == IFN_LAST
6894 && !nunits_out.is_constant ())
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898 "missing target support for reduction on"
6899 " variable-length vectors.\n");
6900 return false;
6903 /* For SLP reductions, see if there is a neutral value we can use. */
6904 tree neutral_op = NULL_TREE;
6905 if (slp_node)
6906 neutral_op = neutral_op_for_slp_reduction
6907 (slp_node_instance->reduc_phis, code,
6908 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6910 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6912 /* We can't support in-order reductions of code such as this:
6914 for (int i = 0; i < n1; ++i)
6915 for (int j = 0; j < n2; ++j)
6916 l += a[j];
6918 since GCC effectively transforms the loop when vectorizing:
6920 for (int i = 0; i < n1 / VF; ++i)
6921 for (int j = 0; j < n2; ++j)
6922 for (int k = 0; k < VF; ++k)
6923 l += a[j];
6925 which is a reassociation of the original operation. */
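/* A concrete reminder (assumed double values) of why this matters for
   an in-order FP reduction: addition is not associative in floating
   point, e.g.

     (0x1p53 + 1.0) + 1.0 == 0x1p53
     0x1p53 + (1.0 + 1.0) == 0x1p53 + 2.0

   so the reassociation above could change the result.  */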
6926 if (dump_enabled_p ())
6927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6928 "in-order double reduction not supported.\n");
6930 return false;
6933 if (reduction_type == FOLD_LEFT_REDUCTION
6934 && slp_node
6935 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6937 /* We cannot use in-order reductions in this case because there is
6938 an implicit reassociation of the operations involved. */
6939 if (dump_enabled_p ())
6940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6941 "in-order unchained SLP reductions not supported.\n");
6942 return false;
6945 /* For double reductions, and for SLP reductions with a neutral value,
6946 we construct a variable-length initial vector by loading a vector
6947 full of the neutral value and then shift-and-inserting the start
6948 values into the low-numbered elements. */
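/* For example (values assumed): for an addition reduction with start
   value S the neutral value is 0, so the initial vector is built as

     tmp  = { 0, 0, 0, ... }           splat of the neutral value
     init = VEC_SHL_INSERT (tmp, S) -> { S, 0, 0, ... }

   which is why IFN_VEC_SHL_INSERT support is checked below.  */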
6949 if ((double_reduc || neutral_op)
6950 && !nunits_out.is_constant ()
6951 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6952 vectype_out, OPTIMIZE_FOR_SPEED))
6954 if (dump_enabled_p ())
6955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6956 "reduction on variable-length vectors requires"
6957 " target support for a vector-shift-and-insert"
6958 " operation.\n");
6959 return false;
6962 /* Check extra constraints for variable-length unchained SLP reductions. */
6963 if (STMT_SLP_TYPE (stmt_info)
6964 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6965 && !nunits_out.is_constant ())
6967 /* We checked above that we could build the initial vector when
6968 there's a neutral element value. Check here for the case in
6969 which each SLP statement has its own initial value and in which
6970 that value needs to be repeated for every instance of the
6971 statement within the initial vector. */
6972 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6973 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6974 if (!neutral_op
6975 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6977 if (dump_enabled_p ())
6978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6979 "unsupported form of SLP reduction for"
6980 " variable-length vectors: cannot build"
6981 " initial vector.\n");
6982 return false;
6984 /* The epilogue code relies on the number of elements being a multiple
6985 of the group size. The duplicate-and-interleave approach to setting
6986 up the initial vector does too. */
6987 if (!multiple_p (nunits_out, group_size))
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6991 "unsupported form of SLP reduction for"
6992 " variable-length vectors: the vector size"
6993 " is not a multiple of the number of results.\n");
6994 return false;
6998 /* In case of widening multiplication by a constant, we update the type
6999 of the constant to be the type of the other operand. We check that the
7000 constant fits the type in the pattern recognition pass. */
7001 if (code == DOT_PROD_EXPR
7002 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7004 if (TREE_CODE (ops[0]) == INTEGER_CST)
7005 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7006 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7007 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7008 else
7010 if (dump_enabled_p ())
7011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7012 "invalid types in dot-prod\n");
7014 return false;
7018 if (reduction_type == COND_REDUCTION)
7020 widest_int ni;
7022 if (! max_loop_iterations (loop, &ni))
7024 if (dump_enabled_p ())
7025 dump_printf_loc (MSG_NOTE, vect_location,
7026 "loop count not known, cannot create cond "
7027 "reduction.\n");
7028 return false;
7030 /* Convert backedges to iterations. */
7031 ni += 1;
7033 /* The additional index will be the same type as the condition. Check
7034 that the loop iteration count fits into this type less one (we use up
7035 the zero slot for when there are no matches). */
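/* E.g. (width assumed for illustration): if the index type ends up
   8 bits wide, its maximum value is 255; since index 0 is reserved
   for "no match", the loop may run at most 254 iterations before we
   have to give up.  */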
7036 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7037 if (wi::geu_p (ni, wi::to_widest (max_index)))
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_NOTE, vect_location,
7041 "loop size is greater than data size.\n");
7042 return false;
7046 /* In case the vectorization factor (VF) is bigger than the number
7047 of elements that we can fit in a vectype (nunits), we have to generate
7048 more than one vector stmt - i.e. we need to "unroll" the
7049 vector stmt by a factor VF/nunits. For more details see documentation
7050 in vectorizable_operation. */
7052 /* If the reduction is used in an outer loop we need to generate
7053 VF intermediate results, like so (e.g. for ncopies=2):
7054 r0 = phi (init, r0)
7055 r1 = phi (init, r1)
7056 r0 = x0 + r0;
7057 r1 = x1 + r1;
7058 (i.e. we generate VF results in 2 registers).
7059 In this case we have a separate def-use cycle for each copy, and therefore
7060 for each copy we get the vector def for the reduction variable from the
7061 respective phi node created for this copy.
7063 Otherwise (the reduction is unused in the loop nest), we can combine
7064 together intermediate results, like so (e.g. for ncopies=2):
7065 r = phi (init, r)
7066 r = x0 + r;
7067 r = x1 + r;
7068 (i.e. we generate VF/2 results in a single register).
7069 In this case for each copy we get the vector def for the reduction variable
7070 from the vectorized reduction operation generated in the previous iteration.
7072 This only works when we see both the reduction PHI and its only consumer
7073 in vectorizable_reduction and there are no intermediate stmts
7074 participating. */
7075 stmt_vec_info use_stmt_info;
7076 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7077 if (ncopies > 1
7078 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7079 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7080 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7082 single_defuse_cycle = true;
7083 epilog_copies = 1;
7085 else
7086 epilog_copies = ncopies;
7088 /* If the reduction stmt is one of the patterns that have lane
7089 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7090 if ((ncopies > 1
7091 && ! single_defuse_cycle)
7092 && (code == DOT_PROD_EXPR
7093 || code == WIDEN_SUM_EXPR
7094 || code == SAD_EXPR))
7096 if (dump_enabled_p ())
7097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7098 "multi def-use cycle not possible for lane-reducing "
7099 "reduction operation\n");
7100 return false;
7103 if (slp_node)
7104 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7105 else
7106 vec_num = 1;
7108 internal_fn cond_fn = get_conditional_internal_fn (code);
7109 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7110 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7112 if (!vec_stmt) /* transformation not required. */
7114 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7115 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7117 if (reduction_type != FOLD_LEFT_REDUCTION
7118 && !mask_by_cond_expr
7119 && (cond_fn == IFN_LAST
7120 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7121 OPTIMIZE_FOR_SPEED)))
7123 if (dump_enabled_p ())
7124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7125 "can't use a fully-masked loop because no"
7126 " conditional operation is available.\n");
7127 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7129 else if (reduc_index == -1)
7131 if (dump_enabled_p ())
7132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7133 "can't use a fully-masked loop for chained"
7134 " reductions.\n");
7135 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7137 else
7138 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7139 vectype_in);
7141 if (dump_enabled_p ()
7142 && reduction_type == FOLD_LEFT_REDUCTION)
7143 dump_printf_loc (MSG_NOTE, vect_location,
7144 "using an in-order (fold-left) reduction.\n");
7145 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7146 return true;
7149 /* Transform. */
7151 if (dump_enabled_p ())
7152 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7154 /* FORNOW: Multiple types are not supported for condition. */
7155 if (code == COND_EXPR)
7156 gcc_assert (ncopies == 1);
7158 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7160 if (reduction_type == FOLD_LEFT_REDUCTION)
7161 return vectorize_fold_left_reduction
7162 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7163 reduc_fn, ops, vectype_in, reduc_index, masks);
7165 if (reduction_type == EXTRACT_LAST_REDUCTION)
7167 gcc_assert (!slp_node);
7168 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7169 true, NULL, NULL);
7172 /* Create the destination vector */
7173 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7175 prev_stmt_info = NULL;
7176 prev_phi_info = NULL;
7177 if (!slp_node)
7179 vec_oprnds0.create (1);
7180 vec_oprnds1.create (1);
7181 if (op_type == ternary_op)
7182 vec_oprnds2.create (1);
7185 phis.create (vec_num);
7186 vect_defs.create (vec_num);
7187 if (!slp_node)
7188 vect_defs.quick_push (NULL_TREE);
7190 if (slp_node)
7191 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7192 else
7193 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7195 for (j = 0; j < ncopies; j++)
7197 if (code == COND_EXPR)
7199 gcc_assert (!slp_node);
7200 vectorizable_condition (stmt_info, gsi, vec_stmt,
7201 true, NULL, NULL);
7202 break;
7204 if (code == LSHIFT_EXPR
7205 || code == RSHIFT_EXPR)
7207 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7208 break;
7211 /* Handle uses. */
7212 if (j == 0)
7214 if (slp_node)
7216 /* Get vec defs for all the operands except the reduction index,
7217 ensuring the ordering of the ops in the vector is kept. */
7218 auto_vec<tree, 3> slp_ops;
7219 auto_vec<vec<tree>, 3> vec_defs;
7221 slp_ops.quick_push (ops[0]);
7222 slp_ops.quick_push (ops[1]);
7223 if (op_type == ternary_op)
7224 slp_ops.quick_push (ops[2]);
7226 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7228 vec_oprnds0.safe_splice (vec_defs[0]);
7229 vec_defs[0].release ();
7230 vec_oprnds1.safe_splice (vec_defs[1]);
7231 vec_defs[1].release ();
7232 if (op_type == ternary_op)
7234 vec_oprnds2.safe_splice (vec_defs[2]);
7235 vec_defs[2].release ();
7238 else
7240 vec_oprnds0.quick_push
7241 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7242 vec_oprnds1.quick_push
7243 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7244 if (op_type == ternary_op)
7245 vec_oprnds2.quick_push
7246 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7249 else
7251 if (!slp_node)
7253 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7255 if (single_defuse_cycle && reduc_index == 0)
7256 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7257 else
7258 vec_oprnds0[0]
7259 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7260 vec_oprnds0[0]);
7261 if (single_defuse_cycle && reduc_index == 1)
7262 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7263 else
7264 vec_oprnds1[0]
7265 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7266 vec_oprnds1[0]);
7267 if (op_type == ternary_op)
7269 if (single_defuse_cycle && reduc_index == 2)
7270 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7271 else
7272 vec_oprnds2[0]
7273 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7274 vec_oprnds2[0]);
7279 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7281 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7282 if (masked_loop_p && !mask_by_cond_expr)
7284 /* Make sure that the reduction accumulator is vop[0]. */
7285 if (reduc_index == 1)
7287 gcc_assert (commutative_tree_code (code));
7288 std::swap (vop[0], vop[1]);
7290 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7291 vectype_in, i * ncopies + j);
7292 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7293 vop[0], vop[1],
7294 vop[0]);
7295 new_temp = make_ssa_name (vec_dest, call);
7296 gimple_call_set_lhs (call, new_temp);
7297 gimple_call_set_nothrow (call, true);
7298 new_stmt_info
7299 = vect_finish_stmt_generation (stmt_info, call, gsi);
7301 else
7303 if (op_type == ternary_op)
7304 vop[2] = vec_oprnds2[i];
7306 if (masked_loop_p && mask_by_cond_expr)
7308 tree mask = vect_get_loop_mask (gsi, masks,
7309 vec_num * ncopies,
7310 vectype_in, i * ncopies + j);
7311 build_vect_cond_expr (code, vop, mask, gsi);
7314 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7315 vop[0], vop[1], vop[2]);
7316 new_temp = make_ssa_name (vec_dest, new_stmt);
7317 gimple_assign_set_lhs (new_stmt, new_temp);
7318 new_stmt_info
7319 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7322 if (slp_node)
7324 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7325 vect_defs.quick_push (new_temp);
7327 else
7328 vect_defs[0] = new_temp;
7331 if (slp_node)
7332 continue;
7334 if (j == 0)
7335 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7336 else
7337 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7339 prev_stmt_info = new_stmt_info;
7342 /* Finalize the reduction-phi (set its arguments) and create the
7343 epilog reduction code. */
7344 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7345 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7347 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7348 epilog_copies, reduc_fn, phis,
7349 double_reduc, slp_node, slp_node_instance,
7350 cond_reduc_val, cond_reduc_op_code,
7351 neutral_op);
7353 return true;
7356 /* Function vect_min_worthwhile_factor.
7358 For a loop where we could vectorize the operation indicated by CODE,
7359 return the minimum vectorization factor that makes it worthwhile
7360 to use generic vectors. */
7361 static unsigned int
7362 vect_min_worthwhile_factor (enum tree_code code)
7364 switch (code)
7366 case PLUS_EXPR:
7367 case MINUS_EXPR:
7368 case NEGATE_EXPR:
7369 return 4;
7371 case BIT_AND_EXPR:
7372 case BIT_IOR_EXPR:
7373 case BIT_XOR_EXPR:
7374 case BIT_NOT_EXPR:
7375 return 2;
7377 default:
7378 return INT_MAX;
7382 /* Return true if VINFO indicates we are doing loop vectorization and if
7383 it is worth decomposing CODE operations into scalar operations for
7384 that loop's vectorization factor. */
7386 bool
7387 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7389 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7390 unsigned HOST_WIDE_INT value;
7391 return (loop_vinfo
7392 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7393 && value >= vect_min_worthwhile_factor (code));
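/* For instance (hypothetical query): with a constant VF of 4, a
   PLUS_EXPR is considered worthwhile (4 >= 4) whereas with a VF of 2
   it is not; a BIT_AND_EXPR is already worthwhile at VF 2.  */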
7396 /* Function vectorizable_induction
7398 Check if STMT_INFO performs an induction computation that can be vectorized.
7399 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7400 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7401 Return true if STMT_INFO is vectorizable in this way. */
7403 bool
7404 vectorizable_induction (stmt_vec_info stmt_info,
7405 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7406 stmt_vec_info *vec_stmt, slp_tree slp_node,
7407 stmt_vector_for_cost *cost_vec)
7409 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7410 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7411 unsigned ncopies;
7412 bool nested_in_vect_loop = false;
7413 struct loop *iv_loop;
7414 tree vec_def;
7415 edge pe = loop_preheader_edge (loop);
7416 basic_block new_bb;
7417 tree new_vec, vec_init, vec_step, t;
7418 tree new_name;
7419 gimple *new_stmt;
7420 gphi *induction_phi;
7421 tree induc_def, vec_dest;
7422 tree init_expr, step_expr;
7423 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7424 unsigned i;
7425 tree expr;
7426 gimple_seq stmts;
7427 imm_use_iterator imm_iter;
7428 use_operand_p use_p;
7429 gimple *exit_phi;
7430 edge latch_e;
7431 tree loop_arg;
7432 gimple_stmt_iterator si;
7434 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7435 if (!phi)
7436 return false;
7438 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7439 return false;
7441 /* Make sure it was recognized as induction computation. */
7442 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7443 return false;
7445 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7446 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7448 if (slp_node)
7449 ncopies = 1;
7450 else
7451 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7452 gcc_assert (ncopies >= 1);
7454 /* FORNOW. These restrictions should be relaxed. */
7455 if (nested_in_vect_loop_p (loop, stmt_info))
7457 imm_use_iterator imm_iter;
7458 use_operand_p use_p;
7459 gimple *exit_phi;
7460 edge latch_e;
7461 tree loop_arg;
7463 if (ncopies > 1)
7465 if (dump_enabled_p ())
7466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7467 "multiple types in nested loop.\n");
7468 return false;
7471 /* FORNOW: outer loop induction with SLP not supported. */
7472 if (STMT_SLP_TYPE (stmt_info))
7473 return false;
7475 exit_phi = NULL;
7476 latch_e = loop_latch_edge (loop->inner);
7477 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7478 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7480 gimple *use_stmt = USE_STMT (use_p);
7481 if (is_gimple_debug (use_stmt))
7482 continue;
7484 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7486 exit_phi = use_stmt;
7487 break;
7490 if (exit_phi)
7492 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7493 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7494 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7496 if (dump_enabled_p ())
7497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7498 "inner-loop induction only used outside "
7499 "of the outer vectorized loop.\n");
7500 return false;
7504 nested_in_vect_loop = true;
7505 iv_loop = loop->inner;
7507 else
7508 iv_loop = loop;
7509 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7511 if (slp_node && !nunits.is_constant ())
7513 /* The current SLP code creates the initial value element-by-element. */
7514 if (dump_enabled_p ())
7515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7516 "SLP induction not supported for variable-length"
7517 " vectors.\n");
7518 return false;
7521 if (!vec_stmt) /* transformation not required. */
7523 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7524 DUMP_VECT_SCOPE ("vectorizable_induction");
7525 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7526 return true;
7529 /* Transform. */
7531 /* Compute a vector variable, initialized with the first VF values of
7532 the induction variable. E.g., for an iv with IV_PHI='X' and
7533 evolution S, for a vector of 4 units, we want to compute:
7534 [X, X + S, X + 2*S, X + 3*S]. */
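/* A worked instance (numbers assumed): for X = 3, S = 2 and VF = 4
   the prolog computes

     vec_init = { 3, 5, 7, 9 }
     vec_step = { 8, 8, 8, 8 }   (VF * S in every lane)

   and each vector iteration advances the IV by vec_step, giving
   { 11, 13, 15, 17 } next, and so on.  */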
7536 if (dump_enabled_p ())
7537 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7539 latch_e = loop_latch_edge (iv_loop);
7540 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7542 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7543 gcc_assert (step_expr != NULL_TREE);
7545 pe = loop_preheader_edge (iv_loop);
7546 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7547 loop_preheader_edge (iv_loop));
7549 stmts = NULL;
7550 if (!nested_in_vect_loop)
7552 /* Convert the initial value to the desired type. */
7553 tree new_type = TREE_TYPE (vectype);
7554 init_expr = gimple_convert (&stmts, new_type, init_expr);
7556 /* If we are using the loop mask to "peel" for alignment then we need
7557 to adjust the start value here. */
7558 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7559 if (skip_niters != NULL_TREE)
7561 if (FLOAT_TYPE_P (vectype))
7562 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7563 skip_niters);
7564 else
7565 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7566 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7567 skip_niters, step_expr);
7568 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7569 init_expr, skip_step);
7573 /* Convert the step to the desired type. */
7574 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7576 if (stmts)
7578 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7579 gcc_assert (!new_bb);
7582 /* Find the first insertion point in the BB. */
7583 basic_block bb = gimple_bb (phi);
7584 si = gsi_after_labels (bb);
7586 /* For SLP induction we have to generate several IVs; for example,
7587 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7588 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7589 [VF*S, VF*S, VF*S, VF*S] for all. */
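/* A worked instance (numbers assumed): with i = 0, S = 1 and 4-lane
   vectors the three IVs start as

     { 0, 0, 0, 1 }  { 1, 1, 2, 2 }  { 2, 3, 3, 3 }

   and each is advanced by the uniform step { 4, 4, 4, 4 }, since the
   three vectors together cover four scalar iterations of the group
   (the first IV becomes { 4, 4, 4, 5 } and so on).  */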
7590 if (slp_node)
7592 /* Enforced above. */
7593 unsigned int const_nunits = nunits.to_constant ();
7595 /* Generate [VF*S, VF*S, ... ]. */
7596 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7598 expr = build_int_cst (integer_type_node, vf);
7599 expr = fold_convert (TREE_TYPE (step_expr), expr);
7601 else
7602 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7603 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7604 expr, step_expr);
7605 if (! CONSTANT_CLASS_P (new_name))
7606 new_name = vect_init_vector (stmt_info, new_name,
7607 TREE_TYPE (step_expr), NULL);
7608 new_vec = build_vector_from_val (vectype, new_name);
7609 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7611 /* Now generate the IVs. */
7612 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7613 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7614 unsigned elts = const_nunits * nvects;
7615 unsigned nivs = least_common_multiple (group_size,
7616 const_nunits) / const_nunits;
7617 gcc_assert (elts % group_size == 0);
7618 tree elt = init_expr;
7619 unsigned ivn;
7620 for (ivn = 0; ivn < nivs; ++ivn)
7622 tree_vector_builder elts (vectype, const_nunits, 1);
7623 stmts = NULL;
7624 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7626 if (ivn*const_nunits + eltn >= group_size
7627 && (ivn * const_nunits + eltn) % group_size == 0)
7628 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7629 elt, step_expr);
7630 elts.quick_push (elt);
7632 vec_init = gimple_build_vector (&stmts, &elts);
7633 if (stmts)
7635 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7636 gcc_assert (!new_bb);
7639 /* Create the induction-phi that defines the induction-operand. */
7640 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7641 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7642 stmt_vec_info induction_phi_info
7643 = loop_vinfo->add_stmt (induction_phi);
7644 induc_def = PHI_RESULT (induction_phi);
7646 /* Create the iv update inside the loop */
7647 vec_def = make_ssa_name (vec_dest);
7648 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7649 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7650 loop_vinfo->add_stmt (new_stmt);
7652 /* Set the arguments of the phi node: */
7653 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7654 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7655 UNKNOWN_LOCATION);
7657 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7660 /* Re-use IVs when we can. */
7661 if (ivn < nvects)
7663 unsigned vfp
7664 = least_common_multiple (group_size, const_nunits) / group_size;
7665 /* Generate [VF'*S, VF'*S, ... ]. */
7666 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7668 expr = build_int_cst (integer_type_node, vfp);
7669 expr = fold_convert (TREE_TYPE (step_expr), expr);
7671 else
7672 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7673 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7674 expr, step_expr);
7675 if (! CONSTANT_CLASS_P (new_name))
7676 new_name = vect_init_vector (stmt_info, new_name,
7677 TREE_TYPE (step_expr), NULL);
7678 new_vec = build_vector_from_val (vectype, new_name);
7679 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7680 for (; ivn < nvects; ++ivn)
7682 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7683 tree def;
7684 if (gimple_code (iv) == GIMPLE_PHI)
7685 def = gimple_phi_result (iv);
7686 else
7687 def = gimple_assign_lhs (iv);
7688 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7689 PLUS_EXPR,
7690 def, vec_step);
7691 if (gimple_code (iv) == GIMPLE_PHI)
7692 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7693 else
7695 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7696 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7698 SLP_TREE_VEC_STMTS (slp_node).quick_push
7699 (loop_vinfo->add_stmt (new_stmt));
7703 return true;
7706 /* Create the vector that holds the initial_value of the induction. */
7707 if (nested_in_vect_loop)
7709 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7710 been created during vectorization of previous stmts. We obtain it
7711 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7712 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7713 /* If the initial value is not of proper type, convert it. */
7714 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7716 new_stmt
7717 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7718 vect_simple_var,
7719 "vec_iv_"),
7720 VIEW_CONVERT_EXPR,
7721 build1 (VIEW_CONVERT_EXPR, vectype,
7722 vec_init));
7723 vec_init = gimple_assign_lhs (new_stmt);
7724 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7725 new_stmt);
7726 gcc_assert (!new_bb);
7727 loop_vinfo->add_stmt (new_stmt);
7730 else
7732 /* iv_loop is the loop to be vectorized. Create:
7733 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7734 stmts = NULL;
7735 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7737 unsigned HOST_WIDE_INT const_nunits;
7738 if (nunits.is_constant (&const_nunits))
7740 tree_vector_builder elts (vectype, const_nunits, 1);
7741 elts.quick_push (new_name);
7742 for (i = 1; i < const_nunits; i++)
7744 /* Create: new_name_i = new_name + step_expr */
7745 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7746 new_name, step_expr);
7747 elts.quick_push (new_name);
7749 /* Create a vector from [new_name_0, new_name_1, ...,
7750 new_name_nunits-1] */
7751 vec_init = gimple_build_vector (&stmts, &elts);
7753 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7754 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7755 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7756 new_name, step_expr);
7757 else
7759 /* Build:
7760 [base, base, base, ...]
7761 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7762 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7763 gcc_assert (flag_associative_math);
7764 tree index = build_index_vector (vectype, 0, 1);
7765 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7766 new_name);
7767 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7768 step_expr);
7769 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7770 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7771 vec_init, step_vec);
7772 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7773 vec_init, base_vec);
7776 if (stmts)
7778 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7779 gcc_assert (!new_bb);
7784 /* Create the vector that holds the step of the induction. */
7785 if (nested_in_vect_loop)
7786 /* iv_loop is nested in the loop to be vectorized. Generate:
7787 vec_step = [S, S, S, S] */
7788 new_name = step_expr;
7789 else
7791 /* iv_loop is the loop to be vectorized. Generate:
7792 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7793 gimple_seq seq = NULL;
7794 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7796 expr = build_int_cst (integer_type_node, vf);
7797 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7799 else
7800 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7801 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7802 expr, step_expr);
7803 if (seq)
7805 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7806 gcc_assert (!new_bb);
7810 t = unshare_expr (new_name);
7811 gcc_assert (CONSTANT_CLASS_P (new_name)
7812 || TREE_CODE (new_name) == SSA_NAME);
7813 new_vec = build_vector_from_val (vectype, t);
7814 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7817 /* Create the following def-use cycle:
7818 loop prolog:
7819 vec_init = ...
7820 vec_step = ...
7821 loop:
7822 vec_iv = PHI <vec_init, vec_loop>
7824 STMT
7826 vec_loop = vec_iv + vec_step; */
7828 /* Create the induction-phi that defines the induction-operand. */
7829 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7830 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7831 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7832 induc_def = PHI_RESULT (induction_phi);
7834 /* Create the iv update inside the loop */
7835 vec_def = make_ssa_name (vec_dest);
7836 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7837 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7838 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7840 /* Set the arguments of the phi node: */
7841 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7842 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7843 UNKNOWN_LOCATION);
7845 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7847 /* In case the vectorization factor (VF) is bigger than the number
7848 of elements that we can fit in a vectype (nunits), we have to generate
7849 more than one vector stmt - i.e. we need to "unroll" the
7850 vector stmt by a factor VF/nunits. For more details see documentation
7851 in vectorizable_operation. */
7853 if (ncopies > 1)
7855 gimple_seq seq = NULL;
7856 stmt_vec_info prev_stmt_vinfo;
7857 /* FORNOW. This restriction should be relaxed. */
7858 gcc_assert (!nested_in_vect_loop);
7860 /* Create the vector that holds the step of the induction. */
7861 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7863 expr = build_int_cst (integer_type_node, nunits);
7864 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7866 else
7867 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7868 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7869 expr, step_expr);
7870 if (seq)
7872 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7873 gcc_assert (!new_bb);
7876 t = unshare_expr (new_name);
7877 gcc_assert (CONSTANT_CLASS_P (new_name)
7878 || TREE_CODE (new_name) == SSA_NAME);
7879 new_vec = build_vector_from_val (vectype, t);
7880 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7882 vec_def = induc_def;
7883 prev_stmt_vinfo = induction_phi_info;
7884 for (i = 1; i < ncopies; i++)
7886 /* vec_i = vec_prev + vec_step */
7887 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7888 vec_def, vec_step);
7889 vec_def = make_ssa_name (vec_dest, new_stmt);
7890 gimple_assign_set_lhs (new_stmt, vec_def);
7892 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7893 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7894 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7895 prev_stmt_vinfo = new_stmt_info;
7899 if (nested_in_vect_loop)
7901 /* Find the loop-closed exit-phi of the induction, and record
7902 the final vector of induction results: */
7903 exit_phi = NULL;
7904 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7906 gimple *use_stmt = USE_STMT (use_p);
7907 if (is_gimple_debug (use_stmt))
7908 continue;
7910 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7912 exit_phi = use_stmt;
7913 break;
7916 if (exit_phi)
7918 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7919 /* FORNOW. Currently not supporting the case that an inner-loop induction
7920 is not used in the outer-loop (i.e. only outside the outer-loop). */
7921 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7922 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7924 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7925 if (dump_enabled_p ())
7926 dump_printf_loc (MSG_NOTE, vect_location,
7927 "vector of inductions after inner-loop:%G",
7928 new_stmt);
7933 if (dump_enabled_p ())
7934 dump_printf_loc (MSG_NOTE, vect_location,
7935 "transform induction: created def-use cycle: %G%G",
7936 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7938 return true;
7941 /* Function vectorizable_live_operation.
7943 STMT_INFO computes a value that is used outside the loop. Check if
7944 it can be supported. */
7946 bool
7947 vectorizable_live_operation (stmt_vec_info stmt_info,
7948 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7949 slp_tree slp_node, int slp_index,
7950 stmt_vec_info *vec_stmt,
7951 stmt_vector_for_cost *)
7953 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7954 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7955 imm_use_iterator imm_iter;
7956 tree lhs, lhs_type, bitsize, vec_bitsize;
7957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7958 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7959 int ncopies;
7960 gimple *use_stmt;
7961 auto_vec<tree> vec_oprnds;
7962 int vec_entry = 0;
7963 poly_uint64 vec_index = 0;
7965 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7967 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7968 return false;
7970 /* FORNOW. CHECKME. */
7971 if (nested_in_vect_loop_p (loop, stmt_info))
7972 return false;
7974 /* If STMT is not relevant and it is a simple assignment and its inputs are
7975 invariant then it can remain in place, unvectorized. The original last
7976 scalar value that it computes will be used. */
7977 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7979 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7980 if (dump_enabled_p ())
7981 dump_printf_loc (MSG_NOTE, vect_location,
7982 "statement is simple and uses invariant. Leaving in "
7983 "place.\n");
7984 return true;
7987 if (slp_node)
7988 ncopies = 1;
7989 else
7990 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7992 if (slp_node)
7994 gcc_assert (slp_index >= 0);
7996 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7997 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7999 /* Get the last occurrence of the scalar index from the concatenation of
8000 all the slp vectors. Calculate which slp vector it is and the index
8001 within. */
8002 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8004 /* Calculate which vector contains the result, and which lane of
8005 that vector we need. */
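/* Worked example (shapes assumed): with 4-lane vectors, 3 vector stmts
   and a group of 3 scalar stmts, the concatenation holds the scalars
   as 0,1,2,0,1,2,0,1,2,0,1,2; for slp_index 1 the last occurrence is
   pos = 12 - 3 + 1 = 10, i.e. lane 2 of vector 2.  */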
8006 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8008 if (dump_enabled_p ())
8009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8010 "Cannot determine which vector holds the"
8011 " final result.\n");
8012 return false;
8016 if (!vec_stmt)
8018 /* No transformation required. */
8019 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8021 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8022 OPTIMIZE_FOR_SPEED))
8024 if (dump_enabled_p ())
8025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8026 "can't use a fully-masked loop because "
8027 "the target doesn't support extract last "
8028 "reduction.\n");
8029 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8031 else if (slp_node)
8033 if (dump_enabled_p ())
8034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8035 "can't use a fully-masked loop because an "
8036 "SLP statement is live after the loop.\n");
8037 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8039 else if (ncopies > 1)
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8043 "can't use a fully-masked loop because"
8044 " ncopies is greater than 1.\n");
8045 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8047 else
8049 gcc_assert (ncopies == 1 && !slp_node);
8050 vect_record_loop_mask (loop_vinfo,
8051 &LOOP_VINFO_MASKS (loop_vinfo),
8052 1, vectype);
8055 return true;
8058 /* Use the lhs of the original scalar statement. */
8059 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8061 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8062 : gimple_get_lhs (stmt);
8063 lhs_type = TREE_TYPE (lhs);
8065 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8066 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8067 : TYPE_SIZE (TREE_TYPE (vectype)));
8068 vec_bitsize = TYPE_SIZE (vectype);
8070 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8071 tree vec_lhs, bitstart;
8072 if (slp_node)
8074 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8076 /* Get the correct slp vectorized stmt. */
8077 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8078 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8079 vec_lhs = gimple_phi_result (phi);
8080 else
8081 vec_lhs = gimple_get_lhs (vec_stmt);
8083 /* Get entry to use. */
8084 bitstart = bitsize_int (vec_index);
8085 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8087 else
8089 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8090 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8091 gcc_checking_assert (ncopies == 1
8092 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8094 /* For multiple copies, get the last copy. */
8095 for (int i = 1; i < ncopies; ++i)
8096 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8098 /* Get the last lane in the vector. */
8099 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8102 gimple_seq stmts = NULL;
8103 tree new_tree;
8104 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8106 /* Emit:
8108 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8110 where VEC_LHS is the vectorized live-out result and MASK is
8111 the loop mask for the final iteration. */
8112 gcc_assert (ncopies == 1 && !slp_node);
8113 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8114 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8115 1, vectype, 0);
8116 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8117 scalar_type, mask, vec_lhs);
8119 /* Convert the extracted vector element to the required scalar type. */
8120 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8122 else
8124 tree bftype = TREE_TYPE (vectype);
8125 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8126 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8127 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8128 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8129 &stmts, true, NULL_TREE);
8132 if (stmts)
8133 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8135 /* Replace the use of lhs with the newly computed result. If the use stmt is
8136 a single-arg PHI, just replace all uses of the PHI result. This is necessary
8137 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8138 use_operand_p use_p;
8139 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8140 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8141 && !is_gimple_debug (use_stmt))
8143 if (gimple_code (use_stmt) == GIMPLE_PHI
8144 && gimple_phi_num_args (use_stmt) == 1)
8146 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8148 else
8150 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8151 SET_USE (use_p, new_tree);
8153 update_stmt (use_stmt);
8156 return true;
8159 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8161 static void
8162 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8164 ssa_op_iter op_iter;
8165 imm_use_iterator imm_iter;
8166 def_operand_p def_p;
8167 gimple *ustmt;
8169 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8171 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8173 basic_block bb;
8175 if (!is_gimple_debug (ustmt))
8176 continue;
8178 bb = gimple_bb (ustmt);
8180 if (!flow_bb_inside_loop_p (loop, bb))
8182 if (gimple_debug_bind_p (ustmt))
8184 if (dump_enabled_p ())
8185 dump_printf_loc (MSG_NOTE, vect_location,
8186 "killing debug use\n");
8188 gimple_debug_bind_reset_value (ustmt);
8189 update_stmt (ustmt);
8191 else
8192 gcc_unreachable ();
8198 /* Given loop represented by LOOP_VINFO, return true if computation of
8199 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8200 otherwise. */
8202 static bool
8203 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8205 /* Constant case. */
8206 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8208 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8209 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8211 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8212 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8213 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8214 return true;
8217 widest_int max;
8218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8219 /* Check the upper bound of loop niters. */
8220 if (get_max_loop_iterations (loop, &max))
8222 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8223 signop sgn = TYPE_SIGN (type);
8224 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8225 if (max < type_max)
8226 return true;
8228 return false;
8231 /* Return a mask type with half the number of elements as TYPE. */
8233 tree
8234 vect_halve_mask_nunits (tree type)
8236 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8237 return build_truth_vector_type (nunits, current_vector_size);
8240 /* Return a mask type with twice as many elements as TYPE. */
8242 tree
8243 vect_double_mask_nunits (tree type)
8245 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8246 return build_truth_vector_type (nunits, current_vector_size);
8249 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8250 contain a sequence of NVECTORS masks that each control a vector of type
8251 VECTYPE. */
8253 void
8254 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8255 unsigned int nvectors, tree vectype)
8257 gcc_assert (nvectors != 0);
8258 if (masks->length () < nvectors)
8259 masks->safe_grow_cleared (nvectors);
8260 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8261 /* The number of scalars per iteration and the number of vectors are
8262 both compile-time constants. */
8263 unsigned int nscalars_per_iter
8264 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8265 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
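/* E.g. (shapes assumed): with a vectorization factor of 16, an 8-lane
   VECTYPE and nvectors == 4, this rgroup controls 4 * 8 / 16 == 2
   scalar values per iteration.  */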
8266 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8268 rgm->max_nscalars_per_iter = nscalars_per_iter;
8269 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8273 /* Given a complete set of masks MASKS, extract mask number INDEX
8274 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8275 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8277 See the comment above vec_loop_masks for more details about the mask
8278 arrangement. */
8280 tree
8281 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8282 unsigned int nvectors, tree vectype, unsigned int index)
8284 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8285 tree mask_type = rgm->mask_type;
8287 /* Populate the rgroup's mask array, if this is the first time we've
8288 used it. */
8289 if (rgm->masks.is_empty ())
8291 rgm->masks.safe_grow_cleared (nvectors);
8292 for (unsigned int i = 0; i < nvectors; ++i)
8294 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8295 /* Provide a dummy definition until the real one is available. */
8296 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8297 rgm->masks[i] = mask;
8301 tree mask = rgm->masks[index];
8302 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8303 TYPE_VECTOR_SUBPARTS (vectype)))
8305 /* A loop mask for data type X can be reused for data type Y
8306 if X has N times more elements than Y and if Y's elements
8307 are N times bigger than X's. In this case each sequence
8308 of N elements in the loop mask will be all-zero or all-one.
8309 We can then view-convert the mask so that each sequence of
8310 N elements is replaced by a single element. */
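/* Concrete illustration (element counts assumed): a mask computed
   for 16 byte elements such as

     { 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0 }

   can serve a vector of 8 halfword elements because each aligned
   pair of lanes is all-one or all-zero; the view-convert yields

     { 1, 1, 1, 1, 1, 0, 0, 0 }.  */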
8311 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8312 TYPE_VECTOR_SUBPARTS (vectype)));
8313 gimple_seq seq = NULL;
8314 mask_type = build_same_sized_truth_vector_type (vectype);
8315 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8316 if (seq)
8317 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8319 return mask;
8322 /* Scale profiling counters by estimation for LOOP which is vectorized
8323 by factor VF. */
8325 static void
8326 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8328 edge preheader = loop_preheader_edge (loop);
8329 /* Reduce loop iterations by the vectorization factor. */
8330 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8331 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8333 if (freq_h.nonzero_p ())
8335 profile_probability p;
8337 /* Avoid dropping loop body profile counter to 0 because of zero count
8338 in loop's preheader. */
8339 if (!(freq_e == profile_count::zero ()))
8340 freq_e = freq_e.force_nonzero ();
8341 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8342 scale_loop_frequencies (loop, p);
8345 edge exit_e = single_exit (loop);
8346 exit_e->probability = profile_probability::always ()
8347 .apply_scale (1, new_est_niter + 1);
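/* E.g. (estimate assumed): with new_est_niter == 7 the exit edge gets
   probability 1/8, and the latch edge, set just below as its inverse,
   gets 7/8.  */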
8349 edge exit_l = single_pred_edge (loop->latch);
8350 profile_probability prob = exit_l->probability;
8351 exit_l->probability = exit_e->probability.invert ();
8352 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8353 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8356 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8357 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8358 stmt_vec_info. */
8360 static void
8361 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8362 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8364 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8365 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8367 if (dump_enabled_p ())
8368 dump_printf_loc (MSG_NOTE, vect_location,
8369 "------>vectorizing statement: %G", stmt_info->stmt);
8371 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8372 vect_loop_kill_debug_uses (loop, stmt_info);
8374 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8375 && !STMT_VINFO_LIVE_P (stmt_info))
8376 return;
8378 if (STMT_VINFO_VECTYPE (stmt_info))
8380 poly_uint64 nunits
8381 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8382 if (!STMT_SLP_TYPE (stmt_info)
8383 && maybe_ne (nunits, vf)
8384 && dump_enabled_p ())
8385 /* For SLP, VF is set according to the unrolling factor, and not
8386 to the vector size, hence this message is not valid for SLP. */
8387 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8390 /* Pure SLP statements have already been vectorized. We still need
8391 to apply loop vectorization to hybrid SLP statements. */
8392 if (PURE_SLP_STMT (stmt_info))
8393 return;
8395 if (dump_enabled_p ())
8396 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8398 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8399 *seen_store = stmt_info;
8402 /* Function vect_transform_loop.
8404 The analysis phase has determined that the loop is vectorizable.
8405 Vectorize the loop - create vectorized stmts to replace the scalar
8406 stmts in the loop, and update the loop exit condition.
8407 Returns the scalar epilogue loop if any. */
8409 struct loop *
8410 vect_transform_loop (loop_vec_info loop_vinfo)
8412 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8413 struct loop *epilogue = NULL;
8414 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8415 int nbbs = loop->num_nodes;
8416 int i;
8417 tree niters_vector = NULL_TREE;
8418 tree step_vector = NULL_TREE;
8419 tree niters_vector_mult_vf = NULL_TREE;
8420 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8421 unsigned int lowest_vf = constant_lower_bound (vf);
8422 gimple *stmt;
8423 bool check_profitability = false;
8424 unsigned int th;
8426 DUMP_VECT_SCOPE ("vec_transform_loop");
8428 loop_vinfo->shared->check_datarefs ();
8430 /* Use the more conservative vectorization threshold. If the number
8431 of iterations is constant, assume the cost check has been performed
8432 by our caller. If the threshold makes all loops profitable that
8433 run at least the (estimated) vectorization factor number of times,
8434 checking is pointless, too. */
8435 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8436 if (th >= vect_vf_for_cost (loop_vinfo)
8437 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8439 if (dump_enabled_p ())
8440 dump_printf_loc (MSG_NOTE, vect_location,
8441 "Profitability threshold is %d loop iterations.\n",
8442 th);
8443 check_profitability = true;
8446 /* Make sure there exists a single-predecessor exit bb. Do this before
8447 versioning. */
8448 edge e = single_exit (loop);
8449 if (! single_pred_p (e->dest))
8451 split_loop_exit_edge (e, true);
8452 if (dump_enabled_p ())
8453 dump_printf (MSG_NOTE, "split exit edge\n");
8456 /* Version the loop first, if required, so the profitability check
8457 comes first. */
8459 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8461 poly_uint64 versioning_threshold
8462 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8463 if (check_profitability
8464 && ordered_p (poly_uint64 (th), versioning_threshold))
8466 versioning_threshold = ordered_max (poly_uint64 (th),
8467 versioning_threshold);
8468 check_profitability = false;
8470 struct loop *sloop
8471 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8472 versioning_threshold);
8473 sloop->force_vectorize = false;
8474 check_profitability = false;
8477 /* Make sure there exists a single-predecessor exit bb also on the
8478 scalar loop copy. Do this after versioning but before peeling
8479 so the CFG structure is fine for both the scalar and the if-converted
8480 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8481 loop-closed PHI nodes on the exit. */
8482 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8484 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8485 if (! single_pred_p (e->dest))
8487 split_loop_exit_edge (e, true);
8488 if (dump_enabled_p ())
8489 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8493 tree niters = vect_build_loop_niters (loop_vinfo);
8494 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8495 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8496 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8497 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8498 &step_vector, &niters_vector_mult_vf, th,
8499 check_profitability, niters_no_overflow);
8501 if (niters_vector == NULL_TREE)
8503 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8504 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8505 && known_eq (lowest_vf, vf))
8507 niters_vector
8508 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8509 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8510 step_vector = build_one_cst (TREE_TYPE (niters));
8512 else
8513 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8514 &step_vector, niters_no_overflow);
8517 /* 1) Make sure the loop header has exactly two entries
8518 2) Make sure we have a preheader basic block. */
8520 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8522 split_edge (loop_preheader_edge (loop));
8524 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8525 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8526 /* This will deal with any possible peeling. */
8527 vect_prepare_for_masked_peels (loop_vinfo);
8529 /* Schedule the SLP instances first, then handle loop vectorization
8530 below. */
8531 if (!loop_vinfo->slp_instances.is_empty ())
8533 DUMP_VECT_SCOPE ("scheduling SLP instances");
8534 vect_schedule_slp (loop_vinfo);
8537 /* FORNOW: the vectorizer supports only loops whose body consists
8538 of one basic block (header + empty latch). When the vectorizer
8539 supports more involved loop forms, the order in which the BBs are
8540 traversed needs to be reconsidered. */
8542 for (i = 0; i < nbbs; i++)
8544 basic_block bb = bbs[i];
8545 stmt_vec_info stmt_info;
8547 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8548 gsi_next (&si))
8550 gphi *phi = si.phi ();
8551 if (dump_enabled_p ())
8552 dump_printf_loc (MSG_NOTE, vect_location,
8553 "------>vectorizing phi: %G", phi);
8554 stmt_info = loop_vinfo->lookup_stmt (phi);
8555 if (!stmt_info)
8556 continue;
8558 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8559 vect_loop_kill_debug_uses (loop, stmt_info);
8561 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8562 && !STMT_VINFO_LIVE_P (stmt_info))
8563 continue;
8565 if (STMT_VINFO_VECTYPE (stmt_info)
8566 && (maybe_ne
8567 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8568 && dump_enabled_p ())
8569 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8571 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8572 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8573 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8574 && ! PURE_SLP_STMT (stmt_info))
8576 if (dump_enabled_p ())
8577 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8578 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8582 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8583 !gsi_end_p (si);)
8585 stmt = gsi_stmt (si);
8586 /* During vectorization remove existing clobber stmts. */
8587 if (gimple_clobber_p (stmt))
8589 unlink_stmt_vdef (stmt);
8590 gsi_remove (&si, true);
8591 release_defs (stmt);
8593 else
8595 stmt_info = loop_vinfo->lookup_stmt (stmt);
8597 /* vector stmts created in the outer-loop during vectorization of
8598 stmts in an inner-loop may not have a stmt_info, and do not
8599 need to be vectorized. */
8600 stmt_vec_info seen_store = NULL;
8601 if (stmt_info)
8603 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8605 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8606 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8607 !gsi_end_p (subsi); gsi_next (&subsi))
8609 stmt_vec_info pat_stmt_info
8610 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8611 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8612 &si, &seen_store);
8614 stmt_vec_info pat_stmt_info
8615 = STMT_VINFO_RELATED_STMT (stmt_info);
8616 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8617 &seen_store);
8619 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8620 &seen_store);
8622 gsi_next (&si);
8623 if (seen_store)
8625 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8626 /* Interleaving. The vectorization of the
8627 interleaving chain was completed - free all
8628 the stores in the chain. */
8629 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8630 else
8631 /* Free the attached stmt_vec_info and remove the stmt. */
8632 loop_vinfo->remove_stmt (stmt_info);
8637 /* Stub out scalar statements that must not survive vectorization.
8638 Doing this here helps with grouped statements, or statements that
8639 are involved in patterns. */
8640 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8641 !gsi_end_p (gsi); gsi_next (&gsi))
8643 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8644 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8646 tree lhs = gimple_get_lhs (call);
8647 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8649 tree zero = build_zero_cst (TREE_TYPE (lhs));
8650 gimple *new_stmt = gimple_build_assign (lhs, zero);
8651 gsi_replace (&gsi, new_stmt, true);
8655 } /* BBs in loop */
8657 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8658 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8659 if (integer_onep (step_vector))
8660 niters_no_overflow = true;
8661 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8662 niters_vector_mult_vf, !niters_no_overflow);
8664 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8665 scale_profile_for_vect_loop (loop, assumed_vf);
8667 /* True if the final iteration might not handle a full vector's
8668 worth of scalar iterations. */
8669 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8670 /* The minimum number of iterations performed by the epilogue. This
8671 is 1 when peeling for gaps because we always need a final scalar
8672 iteration. */
8673 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8674 /* +1 to convert latch counts to loop iteration counts,
8675 -min_epilogue_iters to remove iterations that cannot be performed
8676 by the vector code. */
8677 int bias_for_lowest = 1 - min_epilogue_iters;
8678 int bias_for_assumed = bias_for_lowest;
8679 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8680 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8682 /* When the amount of peeling is known at compile time, the first
8683 iteration will have exactly alignment_npeels active elements.
8684 In the worst case it will have at least one. */
8685 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8686 bias_for_lowest += lowest_vf - min_first_active;
8687 bias_for_assumed += assumed_vf - min_first_active;
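/* Illustrative numbers: with lowest_vf == 8 and alignment_npeels == 3,
   the first masked iteration handles only 3 scalar iterations, so
   bias_for_lowest grows by 8 - 3 == 5 and the divisions below still
   account for that partial first iteration as a full one.  */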
8689 /* In these calculations the "- 1" converts loop iteration counts
8690 back to latch counts. */
8691 if (loop->any_upper_bound)
8692 loop->nb_iterations_upper_bound
8693 = (final_iter_may_be_partial
8694 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8695 lowest_vf) - 1
8696 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8697 lowest_vf) - 1);
8698 if (loop->any_likely_upper_bound)
8699 loop->nb_iterations_likely_upper_bound
8700 = (final_iter_may_be_partial
8701 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8702 + bias_for_lowest, lowest_vf) - 1
8703 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8704 + bias_for_lowest, lowest_vf) - 1);
8705 if (loop->any_estimate)
8706 loop->nb_iterations_estimate
8707 = (final_iter_may_be_partial
8708 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8709 assumed_vf) - 1
8710 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8711 assumed_vf) - 1);
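/* For example (illustrative numbers): with a scalar latch count of 16
   (17 iterations), lowest_vf == 4, no peeling for gaps and no full
   masking, bias_for_lowest is 1 and the new upper bound is
   (16 + 1) / 4 - 1 == 3, i.e. the latch of the vector loop runs at most
   3 times (4 vector iterations).  */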
8713 if (dump_enabled_p ())
8715 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8717 dump_printf_loc (MSG_NOTE, vect_location,
8718 "LOOP VECTORIZED\n");
8719 if (loop->inner)
8720 dump_printf_loc (MSG_NOTE, vect_location,
8721 "OUTER LOOP VECTORIZED\n");
8722 dump_printf (MSG_NOTE, "\n");
8724 else
8726 dump_printf_loc (MSG_NOTE, vect_location,
8727 "LOOP EPILOGUE VECTORIZED (VS=");
8728 dump_dec (MSG_NOTE, current_vector_size);
8729 dump_printf (MSG_NOTE, ")\n");
8733 /* Loops vectorized with a variable factor won't benefit from
8734 unrolling/peeling. */
8735 if (!vf.is_constant ())
8737 loop->unroll = 1;
8738 if (dump_enabled_p ())
8739 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8740 " variable-length vectorization factor\n");
8742 /* Free SLP instances here because otherwise stmt reference counting
8743 won't work. */
8744 slp_instance instance;
8745 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8746 vect_free_slp_instance (instance, true);
8747 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8748 /* Clear the safelen field since its value is invalid after vectorization:
8749 the vectorized loop can have loop-carried dependencies. */
8750 loop->safelen = 0;
8752 /* Don't vectorize the epilogue of an epilogue loop. */
8753 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8754 epilogue = NULL;
8756 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8757 epilogue = NULL;
8759 if (epilogue)
8761 auto_vector_sizes vector_sizes;
8762 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8763 unsigned int next_size = 0;
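/* Check whether any of the target's vector sizes is usable for the
   epilogue; if next_size reaches the end of the list below, the
   epilogue is not vectorized.  */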
8765 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8766 on niters already adjusted for the iterations of the prologue. */
8767 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8768 && known_eq (vf, lowest_vf))
8770 unsigned HOST_WIDE_INT eiters
8771 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8772 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8773 eiters
8774 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8775 epilogue->nb_iterations_upper_bound = eiters - 1;
8776 epilogue->any_upper_bound = true;
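/* Illustrative numbers: with 23 scalar iterations, lowest_vf == 8 and
   no peeling for gaps, eiters is 23 % 8 == 7, so the epilogue runs at
   most 7 iterations and its latch at most 6 times.  */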
8778 unsigned int ratio;
8779 while (next_size < vector_sizes.length ()
8780 && !(constant_multiple_p (current_vector_size,
8781 vector_sizes[next_size], &ratio)
8782 && eiters >= lowest_vf / ratio))
8783 next_size += 1;
8785 else
8786 while (next_size < vector_sizes.length ()
8787 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8788 next_size += 1;
8790 if (next_size == vector_sizes.length ())
8791 epilogue = NULL;
8794 if (epilogue)
8796 epilogue->force_vectorize = loop->force_vectorize;
8797 epilogue->safelen = loop->safelen;
8798 epilogue->dont_vectorize = false;
8800 /* We may need to if-convert the epilogue to vectorize it. */
8801 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8802 tree_if_conversion (epilogue);
8805 return epilogue;
8808 /* The code below performs a simple optimization - it reverts
8809 if-conversion for masked stores: if the mask of a store is zero, skip
8810 the store and, where possible, the producers of the stored values.
8811 For example,
8812 for (i=0; i<n; i++)
8813 if (c[i])
8815 p1[i] += 1;
8816 p2[i] = p3[i] +2;
8818 this transformation will produce the following semi-hammock:
8820 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8822 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8823 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8824 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8825 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8826 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8827 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
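   The masked stores (and, where possible, the statements producing the
   stored values) are sunk into a new basic block that is entered only
   when the mask is not all-zero; see optimize_mask_stores below.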
8831 void
8832 optimize_mask_stores (struct loop *loop)
8834 basic_block *bbs = get_loop_body (loop);
8835 unsigned nbbs = loop->num_nodes;
8836 unsigned i;
8837 basic_block bb;
8838 struct loop *bb_loop;
8839 gimple_stmt_iterator gsi;
8840 gimple *stmt;
8841 auto_vec<gimple *> worklist;
8842 auto_purge_vect_location sentinel;
8844 vect_location = find_loop_location (loop);
8845 /* Pick up all masked stores in the loop, if any. */
8846 for (i = 0; i < nbbs; i++)
8848 bb = bbs[i];
8849 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8850 gsi_next (&gsi))
8852 stmt = gsi_stmt (gsi);
8853 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8854 worklist.safe_push (stmt);
8858 free (bbs);
8859 if (worklist.is_empty ())
8860 return;
8862 /* Loop has masked stores. */
8863 while (!worklist.is_empty ())
8865 gimple *last, *last_store;
8866 edge e, efalse;
8867 tree mask;
8868 basic_block store_bb, join_bb;
8869 gimple_stmt_iterator gsi_to;
8870 tree vdef, new_vdef;
8871 gphi *phi;
8872 tree vectype;
8873 tree zero;
8875 last = worklist.pop ();
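/* The mask is argument 2 of the IFN_MASK_STORE call; the call's
   arguments are base pointer, alignment, mask and stored value, as in
   the example above.  */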
8876 mask = gimple_call_arg (last, 2);
8877 bb = gimple_bb (last);
8878 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
8879 the same loop as if_bb.  It can differ from LOOP when a two-level
8880 loop nest is vectorized and the mask_store belongs to the inner
8881 loop. */
8882 e = split_block (bb, last);
8883 bb_loop = bb->loop_father;
8884 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8885 join_bb = e->dest;
8886 store_bb = create_empty_bb (bb);
8887 add_bb_to_loop (store_bb, bb_loop);
8888 e->flags = EDGE_TRUE_VALUE;
8889 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8890 /* Mark the edge into STORE_BB as unlikely and give STORE_BB a matching count. */
8891 efalse->probability = profile_probability::unlikely ();
8892 store_bb->count = efalse->count ();
8893 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8894 if (dom_info_available_p (CDI_DOMINATORS))
8895 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
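/* Resulting CFG shape (the mask comparison is inserted below):

     bb:  if (mask == { 0, ... })
       |  true                  \  false
       v                         v
     join_bb  <-- fallthru ---  store_bb   (masked stores are sunk here)  */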
8896 if (dump_enabled_p ())
8897 dump_printf_loc (MSG_NOTE, vect_location,
8898 "Create new block %d to sink mask stores.",
8899 store_bb->index);
8900 /* Create vector comparison with boolean result. */
8901 vectype = TREE_TYPE (mask);
8902 zero = build_zero_cst (vectype);
8903 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8904 gsi = gsi_last_bb (bb);
8905 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8906 /* Create new PHI node for vdef of the last masked store:
8907 .MEM_2 = VDEF <.MEM_1>
8908 will be converted to
8909 .MEM.3 = VDEF <.MEM_1>
8910 and new PHI node will be created in join bb
8911 .MEM_2 = PHI <.MEM_1, .MEM_3>
8913 vdef = gimple_vdef (last);
8914 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8915 gimple_set_vdef (last, new_vdef);
8916 phi = create_phi_node (vdef, join_bb);
8917 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8919 /* Put all masked stores with the same mask into STORE_BB if possible. */
8920 while (true)
8922 gimple_stmt_iterator gsi_from;
8923 gimple *stmt1 = NULL;
8925 /* Move masked store to STORE_BB. */
8926 last_store = last;
8927 gsi = gsi_for_stmt (last);
8928 gsi_from = gsi;
8929 /* Shift GSI to the previous stmt for further traversal. */
8930 gsi_prev (&gsi);
8931 gsi_to = gsi_start_bb (store_bb);
8932 gsi_move_before (&gsi_from, &gsi_to);
8933 /* Set GSI_TO to the start of the now non-empty block. */
8934 gsi_to = gsi_start_bb (store_bb);
8935 if (dump_enabled_p ())
8936 dump_printf_loc (MSG_NOTE, vect_location,
8937 "Move stmt to created bb\n%G", last);
8938 /* Move all stored value producers if possible. */
8939 while (!gsi_end_p (gsi))
8941 tree lhs;
8942 imm_use_iterator imm_iter;
8943 use_operand_p use_p;
8944 bool res;
8946 /* Skip debug statements. */
8947 if (is_gimple_debug (gsi_stmt (gsi)))
8949 gsi_prev (&gsi);
8950 continue;
8952 stmt1 = gsi_stmt (gsi);
8953 /* Do not consider statements writing to memory or having a
8954 volatile operand. */
8955 if (gimple_vdef (stmt1)
8956 || gimple_has_volatile_ops (stmt1))
8957 break;
8958 gsi_from = gsi;
8959 gsi_prev (&gsi);
8960 lhs = gimple_get_lhs (stmt1);
8961 if (!lhs)
8962 break;
8964 /* LHS of vectorized stmt must be SSA_NAME. */
8965 if (TREE_CODE (lhs) != SSA_NAME)
8966 break;
8968 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8970 /* Remove dead scalar statement. */
8971 if (has_zero_uses (lhs))
8973 gsi_remove (&gsi_from, true);
8974 continue;
8978 /* Check that LHS does not have uses outside of STORE_BB. */
8979 res = true;
8980 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8982 gimple *use_stmt;
8983 use_stmt = USE_STMT (use_p);
8984 if (is_gimple_debug (use_stmt))
8985 continue;
8986 if (gimple_bb (use_stmt) != store_bb)
8988 res = false;
8989 break;
8992 if (!res)
8993 break;
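/* A producer that reads memory can be sunk only if it reads the same
   memory state (VUSE) as the store being sunk.  */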
8995 if (gimple_vuse (stmt1)
8996 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8997 break;
8999 /* Can move STMT1 to STORE_BB. */
9000 if (dump_enabled_p ())
9001 dump_printf_loc (MSG_NOTE, vect_location,
9002 "Move stmt to created bb\n%G", stmt1);
9003 gsi_move_before (&gsi_from, &gsi_to);
9004 /* Shift GSI_TO for further insertion. */
9005 gsi_prev (&gsi_to);
9007 /* Put other masked stores with the same mask into STORE_BB. */
9008 if (worklist.is_empty ()
9009 || gimple_call_arg (worklist.last (), 2) != mask
9010 || worklist.last () != stmt1)
9011 break;
9012 last = worklist.pop ();
9014 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);