gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
/* Forward declaration; the definition is not part of this section of the
   file.  NOTE(review): judging by the name only, this presumably estimates
   the minimum iteration counts for which vectorization is profitable and
   returns them through the two int pointers -- confirm against the
   definition.  */
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static opt_result
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return opt_result::success ();
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                                    &nunits_vectype);
 182   if (!res)
 183     return res;
 184
 185   if (stmt_vectype)
 186     {
 187       if (STMT_VINFO_VECTYPE (stmt_info))
 188         /* The only case when a vectype had been already set is for stmts
 189            that contain a data ref, or for "pattern-stmts" (stmts generated
 190            by the vectorizer to represent/replace a certain idiom).  */
 191         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 192                      || vectype_maybe_set_p)
 193                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 194       else if (stmt_vectype == boolean_type_node)
 195         mask_producers->safe_push (stmt_info);
 196       else
 197         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 198     }
 199
 200   if (nunits_vectype)
 201     vect_update_max_nunits (vf, nunits_vectype);
 202
 203   return opt_result::success ();
 204 }
 205
 206 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 207    types of STMT_INFO and all attached pattern statements and update
 208    the vectorization factor VF accordingly.  If some of the statements
 209    produce a mask result whose vector type can only be calculated later,
 210    add them to MASK_PRODUCERS.  Return true on success or false if
 211    something prevented vectorization.  */
 212
 213 static opt_result
 214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 215                             vec<stmt_vec_info > *mask_producers)
 216 {
 217   vec_info *vinfo = stmt_info->vinfo;
 218   if (dump_enabled_p ())
 219     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 220                      stmt_info->stmt);
 221   opt_result res
 222     = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
 223   if (!res)
 224     return res;
 225
 226   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 227       && STMT_VINFO_RELATED_STMT (stmt_info))
 228     {
 229       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 230       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 231
 232       /* If a pattern statement has def stmts, analyze them too.  */
 233       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 234            !gsi_end_p (si); gsi_next (&si))
 235         {
 236           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 237           if (dump_enabled_p ())
 238             dump_printf_loc (MSG_NOTE, vect_location,
 239                              "==> examining pattern def stmt: %G",
 240                              def_stmt_info->stmt);
 241           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 242                                              vf, mask_producers))
 243           res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 244                                               vf, mask_producers);
 245           if (!res)
 246             return res;
 247         }
 248
 249       if (dump_enabled_p ())
 250         dump_printf_loc (MSG_NOTE, vect_location,
 251                          "==> examining pattern statement: %G",
 252                          stmt_info->stmt);
 253       res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
 254       if (!res)
 255         return res;
 256     }
 257
 258   return opt_result::success ();
 259 }
 260
 261 /* Function vect_determine_vectorization_factor
 262
 263    Determine the vectorization factor (VF).  VF is the number of data elements
 264    that are operated upon in parallel in a single iteration of the vectorized
 265    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 266    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 267    elements can fit in a single vector register.
 268
 269    We currently support vectorization of loops in which all types operated upon
 270    are of the same size.  Therefore this function currently sets VF according to
 271    the size of the types operated upon, and fails if there are multiple sizes
 272    in the loop.
 273
 274    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 275    original loop:
 276         for (i=0; i<N; i++){
 277           a[i] = b[i] + c[i];
 278         }
 279
 280    vectorized loop:
 281         for (i=0; i<N; i+=VF){
 282           a[i:VF] = b[i:VF] + c[i:VF];
 283         }
 284 */
 285
 286 static opt_result
 287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 288 {
 289   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 290   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 291   unsigned nbbs = loop->num_nodes;
 292   poly_uint64 vectorization_factor = 1;
 293   tree scalar_type = NULL_TREE;
 294   gphi *phi;
 295   tree vectype;
 296   stmt_vec_info stmt_info;
 297   unsigned i;
 298   auto_vec<stmt_vec_info> mask_producers;
 299
 300   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 301
 302   for (i = 0; i < nbbs; i++)
 303     {
 304       basic_block bb = bbs[i];
 305
 306       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 307            gsi_next (&si))
 308         {
 309           phi = si.phi ();
 310           stmt_info = loop_vinfo->lookup_stmt (phi);
 311           if (dump_enabled_p ())
 312             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 313                              phi);
 314
 315           gcc_assert (stmt_info);
 316
 317           if (STMT_VINFO_RELEVANT_P (stmt_info)
 318               || STMT_VINFO_LIVE_P (stmt_info))
 319             {
 320               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 321               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 322
 323               if (dump_enabled_p ())
 324                 dump_printf_loc (MSG_NOTE, vect_location,
 325                                  "get vectype for scalar type:  %T\n",
 326                                  scalar_type);
 327
 328               vectype = get_vectype_for_scalar_type (scalar_type);
 329               if (!vectype)
 330                 return opt_result::failure_at (phi,
 331                                                "not vectorized: unsupported "
 332                                                "data-type %T\n",
 333                                                scalar_type);
 334               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 335
 336               if (dump_enabled_p ())
 337                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 338                                  vectype);
 339
 340               if (dump_enabled_p ())
 341                 {
 342                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 343                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 344                   dump_printf (MSG_NOTE, "\n");
 345                 }
 346
 347               vect_update_max_nunits (&vectorization_factor, vectype);
 348             }
 349         }
 350
 351       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 352            gsi_next (&si))
 353         {
 354           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 355           opt_result res
 356             = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 357                                           &mask_producers);
 358           if (!res)
 359             return res;
 360         }
 361     }
 362
 363   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 364   if (dump_enabled_p ())
 365     {
 366       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 367       dump_dec (MSG_NOTE, vectorization_factor);
 368       dump_printf (MSG_NOTE, "\n");
 369     }
 370
 371   if (known_le (vectorization_factor, 1U))
 372     return opt_result::failure_at (vect_location,
 373                                    "not vectorized: unsupported data-type\n");
 374   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 375
 376   for (i = 0; i < mask_producers.length (); i++)
 377     {
 378       stmt_info = mask_producers[i];
 379       opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 380       if (!mask_type)
 381         return opt_result::propagate_failure (mask_type);
 382       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 383     }
 384
 385   return opt_result::success ();
 386 }
 387
 388
 389 /* Function vect_is_simple_iv_evolution.
 390
 391    FORNOW: A simple evolution of an induction variables in the loop is
 392    considered a polynomial evolution.  */
 393
 394 static bool
 395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 396                              tree * step)
 397 {
 398   tree init_expr;
 399   tree step_expr;
 400   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 401   basic_block bb;
 402
 403   /* When there is no evolution in this loop, the evolution function
 404      is not "simple".  */
 405   if (evolution_part == NULL_TREE)
 406     return false;
 407
 408   /* When the evolution is a polynomial of degree >= 2
 409      the evolution function is not "simple".  */
 410   if (tree_is_chrec (evolution_part))
 411     return false;
 412
 413   step_expr = evolution_part;
 414   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 415
 416   if (dump_enabled_p ())
 417     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 418                      step_expr, init_expr);
 419
 420   *init = init_expr;
 421   *step = step_expr;
 422
 423   if (TREE_CODE (step_expr) != INTEGER_CST
 424       && (TREE_CODE (step_expr) != SSA_NAME
 425           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 426               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 427           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 428               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 429                   || !flag_associative_math)))
 430       && (TREE_CODE (step_expr) != REAL_CST
 431           || !flag_associative_math))
 432     {
 433       if (dump_enabled_p ())
 434         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 435                          "step unknown.\n");
 436       return false;
 437     }
 438
 439   return true;
 440 }
 441
 442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 443    what we are assuming is a double reduction.  For example, given
 444    a structure like this:
 445
 446       outer1:
 447         x_1 = PHI <x_4(outer2), ...>;
 448         ...
 449
 450       inner:
 451         x_2 = PHI <x_1(outer1), ...>;
 452         ...
 453         x_3 = ...;
 454         ...
 455
 456       outer2:
 457         x_4 = PHI <x_3(inner)>;
 458         ...
 459
 460    outer loop analysis would treat x_1 as a double reduction phi and
 461    this function would then return true for x_2.  */
 462
 463 static bool
 464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
 465 {
 466   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 467   use_operand_p use_p;
 468   ssa_op_iter op_iter;
 469   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 470     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 471       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 472         return true;
 473   return false;
 474 }
 475
 476 /* Function vect_analyze_scalar_cycles_1.
 477
 478    Examine the cross iteration def-use cycles of scalar variables
 479    in LOOP.  LOOP_VINFO represents the loop that is now being
 480    considered for vectorization (can be LOOP, or an outer-loop
 481    enclosing LOOP).  */
 482
 483 static void
 484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 485 {
 486   basic_block bb = loop->header;
 487   tree init, step;
 488   auto_vec<stmt_vec_info, 64> worklist;
 489   gphi_iterator gsi;
 490   bool double_reduc;
 491
 492   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 493
 494   /* First - identify all inductions.  Reduction detection assumes that all the
 495      inductions have been identified, therefore, this order must not be
 496      changed.  */
 497   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 498     {
 499       gphi *phi = gsi.phi ();
 500       tree access_fn = NULL;
 501       tree def = PHI_RESULT (phi);
 502       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 503
 504       if (dump_enabled_p ())
 505         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 506
 507       /* Skip virtual phi's.  The data dependences that are associated with
 508          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 509       if (virtual_operand_p (def))
 510         continue;
 511
 512       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 513
 514       /* Analyze the evolution function.  */
 515       access_fn = analyze_scalar_evolution (loop, def);
 516       if (access_fn)
 517         {
 518           STRIP_NOPS (access_fn);
 519           if (dump_enabled_p ())
 520             dump_printf_loc (MSG_NOTE, vect_location,
 521                              "Access function of PHI: %T\n", access_fn);
 522           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 523             = initial_condition_in_loop_num (access_fn, loop->num);
 524           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 525             = evolution_part_in_loop_num (access_fn, loop->num);
 526         }
 527
 528       if (!access_fn
 529           || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
 530           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 531           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 532               && TREE_CODE (step) != INTEGER_CST))
 533         {
 534           worklist.safe_push (stmt_vinfo);
 535           continue;
 536         }
 537
 538       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 539                   != NULL_TREE);
 540       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 541
 542       if (dump_enabled_p ())
 543         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 544       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 545     }
 546
 547
 548   /* Second - identify all reductions and nested cycles.  */
 549   while (worklist.length () > 0)
 550     {
 551       stmt_vec_info stmt_vinfo = worklist.pop ();
 552       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 553       tree def = PHI_RESULT (phi);
 554
 555       if (dump_enabled_p ())
 556         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 557
 558       gcc_assert (!virtual_operand_p (def)
 559                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 560
 561       stmt_vec_info reduc_stmt_info
 562         = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
 563                                        &double_reduc, false);
 564       if (reduc_stmt_info)
 565         {
 566           if (double_reduc)
 567             {
 568               if (dump_enabled_p ())
 569                 dump_printf_loc (MSG_NOTE, vect_location,
 570                                  "Detected double reduction.\n");
 571
 572               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 573               STMT_VINFO_DEF_TYPE (reduc_stmt_info)
 574                 = vect_double_reduction_def;
 575             }
 576           else
 577             {
 578               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 579                 {
 580                   if (dump_enabled_p ())
 581                     dump_printf_loc (MSG_NOTE, vect_location,
 582                                      "Detected vectorizable nested cycle.\n");
 583
 584                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 585                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
 586                 }
 587               else
 588                 {
 589                   if (dump_enabled_p ())
 590                     dump_printf_loc (MSG_NOTE, vect_location,
 591                                      "Detected reduction.\n");
 592
 593                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 594                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 595                   /* Store the reduction cycles for possible vectorization in
 596                      loop-aware SLP if it was not detected as reduction
 597                      chain.  */
 598                   if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
 599                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 600                       (reduc_stmt_info);
 601                 }
 602             }
 603         }
 604       else
 605         if (dump_enabled_p ())
 606           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 607                            "Unknown def-use cycle pattern.\n");
 608     }
 609 }
 610
 611
 612 /* Function vect_analyze_scalar_cycles.
 613
 614    Examine the cross iteration def-use cycles of scalar variables, by
 615    analyzing the loop-header PHIs of scalar variables.  Classify each
 616    cycle as one of the following: invariant, induction, reduction, unknown.
 617    We do that for the loop represented by LOOP_VINFO, and also to its
 618    inner-loop, if exists.
 619    Examples for scalar cycles:
 620
 621    Example1: reduction:
 622
 623               loop1:
 624               for (i=0; i<N; i++)
 625                  sum += a[i];
 626
 627    Example2: induction:
 628
 629               loop2:
 630               for (i=0; i<N; i++)
 631                  a[i] = i;  */
 632
 633 static void
 634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 635 {
 636   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 637
 638   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 639
 640   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 641      Reductions in such inner-loop therefore have different properties than
 642      the reductions in the nest that gets vectorized:
 643      1. When vectorized, they are executed in the same order as in the original
 644         scalar loop, so we can't change the order of computation when
 645         vectorizing them.
 646      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 647         current checks are too strict.  */
 648
 649   if (loop->inner)
 650     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 651 }
 652
 653 /* Transfer group and reduction information from STMT_INFO to its
 654    pattern stmt.  */
 655
 656 static void
 657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 658 {
 659   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 660   stmt_vec_info stmtp;
 661   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 662               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 663   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 664   do
 665     {
 666       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 667       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 668       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 669       if (stmt_info)
 670         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 671           = STMT_VINFO_RELATED_STMT (stmt_info);
 672     }
 673   while (stmt_info);
 674   STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
 675 }
 676
 677 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 678
 679 static void
 680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 681 {
 682   stmt_vec_info first;
 683   unsigned i;
 684
 685   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 686     if (STMT_VINFO_IN_PATTERN_P (first))
 687       {
 688         stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 689         while (next)
 690           {
 691             if (! STMT_VINFO_IN_PATTERN_P (next))
 692               break;
 693             next = REDUC_GROUP_NEXT_ELEMENT (next);
 694           }
 695         /* If not all stmt in the chain are patterns try to handle
 696            the chain without patterns.  */
 697         if (! next)
 698           {
 699             vect_fixup_reduc_chain (first);
 700             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 701               = STMT_VINFO_RELATED_STMT (first);
 702           }
 703       }
 704 }
 705
 706 /* Function vect_get_loop_niters.
 707
 708    Determine how many iterations the loop is executed and place it
 709    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 710    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 711    niter information holds in ASSUMPTIONS.
 712
 713    Return the loop exit condition.  */
 714
 715
 716 static gcond *
 717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 718                       tree *number_of_iterations, tree *number_of_iterationsm1)
 719 {
 720   edge exit = single_exit (loop);
 721   struct tree_niter_desc niter_desc;
 722   tree niter_assumptions, niter, may_be_zero;
 723   gcond *cond = get_loop_exit_condition (loop);
 724
 725   *assumptions = boolean_true_node;
 726   *number_of_iterationsm1 = chrec_dont_know;
 727   *number_of_iterations = chrec_dont_know;
 728   DUMP_VECT_SCOPE ("get_loop_niters");
 729
 730   if (!exit)
 731     return cond;
 732
 733   may_be_zero = NULL_TREE;
 734   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 735       || chrec_contains_undetermined (niter_desc.niter))
 736     return cond;
 737
 738   niter_assumptions = niter_desc.assumptions;
 739   may_be_zero = niter_desc.may_be_zero;
 740   niter = niter_desc.niter;
 741
 742   if (may_be_zero && integer_zerop (may_be_zero))
 743     may_be_zero = NULL_TREE;
 744
 745   if (may_be_zero)
 746     {
 747       if (COMPARISON_CLASS_P (may_be_zero))
 748         {
 749           /* Try to combine may_be_zero with assumptions, this can simplify
 750              computation of niter expression.  */
 751           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 752             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 753                                              niter_assumptions,
 754                                              fold_build1 (TRUTH_NOT_EXPR,
 755                                                           boolean_type_node,
 756                                                           may_be_zero));
 757           else
 758             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 759                                  build_int_cst (TREE_TYPE (niter), 0),
 760                                  rewrite_to_non_trapping_overflow (niter));
 761
 762           may_be_zero = NULL_TREE;
 763         }
 764       else if (integer_nonzerop (may_be_zero))
 765         {
 766           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 767           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 768           return cond;
 769         }
 770       else
 771         return cond;
 772     }
 773
 774   *assumptions = niter_assumptions;
 775   *number_of_iterationsm1 = niter;
 776
 777   /* We want the number of loop header executions which is the number
 778      of latch executions plus one.
 779      ???  For UINT_MAX latch executions this number overflows to zero
 780      for loops like do { n++; } while (n != 0);  */
 781   if (niter && !chrec_contains_undetermined (niter))
 782     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 783                           build_int_cst (TREE_TYPE (niter), 1));
 784   *number_of_iterations = niter;
 785
 786   return cond;
 787 }
 788
 789 /* Function bb_in_loop_p
 790
 791    Predicate handed to dfs_enumerate_from when collecting the blocks of a
 792    loop: DATA is that loop, and the result says whether BB lies inside it.  */
 793
 794 static bool
 795 bb_in_loop_p (const_basic_block bb, const void *data)
 796 {
 797   const struct loop *const enclosing_loop = (const struct loop *) data;
 798
 799   return flow_bb_inside_loop_p (enclosing_loop, bb);
 800 }
 801
 802
 803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 804    stmt_vec_info structs for all the phis and stmts in LOOP_IN's blocks.  */
 805
 806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
 807   : vec_info (vec_info::loop, init_cost (loop_in), shared),
 808     loop (loop_in),
 809     bbs (XCNEWVEC (basic_block, loop->num_nodes)), /* Filled by the DFS below.  */
 810     num_itersm1 (NULL_TREE),
 811     num_iters (NULL_TREE),
 812     num_iters_unchanged (NULL_TREE),
 813     num_iters_assumptions (NULL_TREE),
 814     th (0),
 815     versioning_threshold (0),
 816     vectorization_factor (0),
 817     max_vectorization_factor (0),
 818     mask_skip_niters (NULL_TREE),
 819     mask_compare_type (NULL_TREE),
 820     simd_if_cond (NULL_TREE),
 821     unaligned_dr (NULL),
 822     peeling_for_alignment (0),
 823     ptr_mask (0),
 824     ivexpr_map (NULL),
 825     scan_map (NULL),
 826     slp_unrolling_factor (1),
 827     single_scalar_iteration_cost (0),
 828     vectorizable (false),
 829     can_fully_mask_p (true),
 830     fully_masked_p (false),
 831     peeling_for_gaps (false),
 832     peeling_for_niter (false),
 833     operands_swapped (false),
 834     no_data_dependencies (false),
 835     has_mask_store (false),
 836     scalar_loop_scaling (profile_probability::uninitialized ()),
 837     scalar_loop (NULL),
 838     orig_loop_info (NULL)
 839 {
 840   /* CHECKME: We want to visit all BBs before their successors (except for
 841      latch blocks, for which this assertion wouldn't hold).  In the simple
 842      case of the loop forms we allow, a dfs order of the BBs would be the same
 843      as reversed postorder traversal, so we are safe.  */
 844
 845   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 846                                           bbs, loop->num_nodes, loop);
 847   gcc_assert (nbbs == loop->num_nodes); /* Every block must be reached.  */
 848   /* Add every phi and every stmt of each block to this vec_info.  */
 849   for (unsigned int i = 0; i < nbbs; i++)
 850     {
 851       basic_block bb = bbs[i];
 852       gimple_stmt_iterator si;
 853
 854       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 855         {
 856           gimple *phi = gsi_stmt (si);
 857           gimple_set_uid (phi, 0); /* Reset the uid before add_stmt.  */
 858           add_stmt (phi);
 859         }
 860
 861       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 862         {
 863           gimple *stmt = gsi_stmt (si);
 864           gimple_set_uid (stmt, 0);
 865           add_stmt (stmt);
 866           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 867              third argument is the #pragma omp simd if (x) condition, when 0,
 868              loop shouldn't be vectorized, when non-zero constant, it should
 869              be vectorized normally, otherwise versioned with vectorized loop
 870              done if the condition is non-zero at runtime.  */
 871           if (loop_in->simduid
 872               && is_gimple_call (stmt)
 873               && gimple_call_internal_p (stmt)
 874               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 875               && gimple_call_num_args (stmt) >= 3
 876               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 877               && (loop_in->simduid
 878                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 879             {
 880               tree arg = gimple_call_arg (stmt, 2);
 881               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 882                 simd_if_cond = arg; /* Constant zero or a runtime value.  */
 883               else
 884                 gcc_assert (integer_nonzerop (arg)); /* Nothing to record.  */
 885             }
 886         }
 887     }
 888 }
 889
 890 /* Release the mask vector of every rgroup in MASKS, then MASKS itself.  */
 891
 892 void
 893 release_vec_loop_masks (vec_loop_masks *masks)
 894 {
 895   /* The inner vectors go first, since they are reached through MASKS.  */
 896   for (unsigned int level = 0; level < masks->length (); ++level)
 897     (*masks)[level].masks.release ();
 898
 899   masks->release ();
 900 }
 901
 902 /* Free all memory used by the _loop_vec_info, as well as all the stmt_vec_info
 903    structs of all the stmts in the loop, after undoing any operand swaps.  */
 904
 905 _loop_vec_info::~_loop_vec_info ()
 906 {
 907   int nbbs;
 908   gimple_stmt_iterator si;
 909   int j;
 910   /* Walk every stmt; the body only acts when operands_swapped is set.  */
 911   nbbs = loop->num_nodes;
 912   for (j = 0; j < nbbs; j++)
 913     {
 914       basic_block bb = bbs[j];
 915       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 916         {
 917           gimple *stmt = gsi_stmt (si);
 918
 919           /* We may have broken canonical form by moving a constant
 920              into RHS1 of a commutative op.  Fix such occurrences.  */
 921           if (operands_swapped && is_gimple_assign (stmt))
 922             {
 923               enum tree_code code = gimple_assign_rhs_code (stmt);
 924
 925               if ((code == PLUS_EXPR
 926                    || code == POINTER_PLUS_EXPR
 927                    || code == MULT_EXPR)
 928                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 929                 swap_ssa_operands (stmt,
 930                                    gimple_assign_rhs1_ptr (stmt),
 931                                    gimple_assign_rhs2_ptr (stmt));
 932               else if (code == COND_EXPR
 933                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 934                 {
 935                   tree cond_expr = gimple_assign_rhs1 (stmt);
 936                   enum tree_code cond_code = TREE_CODE (cond_expr);
 937                   /* Swapping the two arms requires inverting the comparison.  */
 938                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 939                     {
 940                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 941                                                                   0));
 942                       cond_code = invert_tree_comparison (cond_code,
 943                                                           honor_nans);
 944                       if (cond_code != ERROR_MARK) /* Skip if not invertible.  */
 945                         {
 946                           TREE_SET_CODE (cond_expr, cond_code);
 947                           swap_ssa_operands (stmt,
 948                                              gimple_assign_rhs2_ptr (stmt),
 949                                              gimple_assign_rhs3_ptr (stmt));
 950                         }
 951                     }
 952                 }
 953             }
 954           gsi_next (&si);
 955         }
 956     }
 957   /* Release the block array, the loop masks and both maps.  */
 958   free (bbs);
 959
 960   release_vec_loop_masks (&masks);
 961   delete ivexpr_map;
 962   delete scan_map;
 963   /* The loop must no longer point at this object.  */
 964   loop->aux = NULL;
 965 }
 966
 967 /* Return an invariant or register that holds the value of EXPR.  Any
 968    statements needed to compute it go on LOOP_VINFO's preheader edge, and
 969    the result is cached in LOOP_VINFO's ivexpr_map for later requests.  */
 970
 971 tree
 972 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 973 {
 974   /* Registers and invariants can be used as they are.  */
 975   if (is_gimple_reg (expr) || is_gimple_min_invariant (expr))
 976     return expr;
 977
 978   /* The cache is only allocated once something needs it.  */
 979   if (loop_vinfo->ivexpr_map == NULL)
 980     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 981   tree &slot = loop_vinfo->ivexpr_map->get_or_insert (expr);
 982   if (slot)
 983     return slot;
 984
 985   /* First request for EXPR: gimplify a private copy of it.  */
 986   gimple_seq new_stmts = NULL;
 987   slot = force_gimple_operand (unshare_expr (expr), &new_stmts, true,
 988                                NULL_TREE);
 989   if (new_stmts)
 990     gsi_insert_seq_on_edge_immediate
 991       (loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)), new_stmts);
 992   return slot;
 993 }
 993
 994 /* Return true if CMP_TYPE can serve as the comparison type for producing
 995    every mask that is needed to mask LOOP_VINFO.  */
 996
 997 static bool
 998 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 999 {
1000   vec_loop_masks &all_masks = LOOP_VINFO_MASKS (loop_vinfo);
1001   for (unsigned int idx = 0; idx < all_masks.length (); ++idx)
1002     if (all_masks[idx].mask_type != NULL_TREE
1003         && !direct_internal_fn_supported_p (IFN_WHILE_ULT, cmp_type,
1004                                             all_masks[idx].mask_type,
1005                                             OPTIMIZE_FOR_SPEED))
1006       return false;
1007
1008   return true;
1009 }
1010
1011 /* Return the largest max_nscalars_per_iter over all rgroups in
1012    LOOP_VINFO, or 1 if no rgroup exceeds that.  */
1013
1014 static unsigned int
1015 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1016 {
1017   unsigned int widest = 1;
1018   vec_loop_masks &all_masks = LOOP_VINFO_MASKS (loop_vinfo);
1019   for (unsigned int idx = 0; idx < all_masks.length (); ++idx)
1020     if (all_masks[idx].max_nscalars_per_iter > widest)
1021       widest = all_masks[idx].max_nscalars_per_iter;
1022   return widest;
1023 }
1024
1025 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1026    whether we can actually generate the masks required.  Return true if so,
1027    setting LOOP_VINFO_MASK_COMPARE_TYPE and LOOP_VINFO_MASK_IV_TYPE.  */
1028
1029 static bool
1030 vect_verify_full_masking (loop_vec_info loop_vinfo)
1031 {
1032   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1033   unsigned int min_ni_width;
1034   unsigned int max_nscalars_per_iter
1035     = vect_get_max_nscalars_per_iter (loop_vinfo);
1036
1037   /* Use a normal loop if there are no statements that need masking.
1038      This only happens in rare degenerate cases: it means that the loop
1039      has no loads, no stores, and no live-out values.  */
1040   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1041     return false;
1042
1043   /* Get the maximum number of iterations that is representable
1044      in the counter type.  */
1045   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1046   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1047
1048   /* Get a more refined estimate for the number of iterations.  */
1049   widest_int max_back_edges;
1050   if (max_loop_iterations (loop, &max_back_edges))
1051     max_ni = wi::smin (max_ni, max_back_edges + 1);
1052
1053   /* Account for rgroup masks, in which each bit is replicated N times.  */
1054   max_ni *= max_nscalars_per_iter;
1055
1056   /* Work out how many bits we need to represent the limit.  */
1057   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1058
1059   /* Find a scalar mode for which WHILE_ULT is supported.  */
1060   opt_scalar_int_mode cmp_mode_iter;
1061   tree cmp_type = NULL_TREE;
1062   tree iv_type = NULL_TREE;
1063   widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1064   unsigned int iv_precision = UINT_MAX;
1065   /* NOTE(review): -1 appears to mean "no usable limit" -- confirm.  */
1066   if (iv_limit != -1)
1067     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1068                                       UNSIGNED);
1069
1070   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1071     {
1072       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1073       if (cmp_bits >= min_ni_width
1074           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1075         {
1076           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1077           if (this_type
1078               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1079             {
1080               /* Although we could stop as soon as we find a valid mode,
1081                  there are at least two reasons why that's not always the
1082                  best choice:
1083
1084                  - An IV that's Pmode or wider is more likely to be reusable
1085                    in address calculations than an IV that's narrower than
1086                    Pmode.
1087
1088                  - Doing the comparison in IV_PRECISION or wider allows
1089                    a natural 0-based IV, whereas using a narrower comparison
1090                    type requires mitigations against wrap-around.
1091
1092                  Conversely, if the IV limit is variable, doing the comparison
1093                  in a wider type than the original type can introduce
1094                  unnecessary extensions, so picking the widest valid mode
1095                  is not always a good choice either.
1096
1097                  Here we prefer the first IV type that's Pmode or wider,
1098                  and the first comparison type that's IV_PRECISION or wider.
1099                  (The comparison type must be no wider than the IV type,
1100                  to avoid extensions in the vector loop.)
1101
1102                  ??? We might want to try continuing beyond Pmode for ILP32
1103                  targets if CMP_BITS < IV_PRECISION.  */
1104               iv_type = this_type;
1105               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1106                 cmp_type = this_type;
1107               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1108                 break;
1109             }
1110         }
1111     }
1112   /* CMP_TYPE is still null if no mode passed the checks above.  */
1113   if (!cmp_type)
1114     return false;
1115
1116   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1117   LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1118   return true;
1119 }
1120
1121 /* Compute the cost of a single scalar iteration of LOOP_VINFO's loop and
1122    store it in LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */
1123
1124 static void
1125 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1126 {
1127   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1128   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1129   const int nbbs = loop->num_nodes;
1130
1131   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1132
1133   /* Gather costs for statements in the scalar loop.  */
1134
1135   /* FORNOW: a nested loop is assumed to iterate a fixed number of times.
1136      FIXME.  */
1137   const int innerloop_iters = loop->inner ? 50 : 1;
1138
1139   for (int bb_idx = 0; bb_idx < nbbs; bb_idx++)
1140     {
1141       basic_block bb = bbs[bb_idx];
1142
1143       /* Statements of the inner loop are weighted by its iteration count.  */
1144       const int factor
1145         = bb->loop_father == loop->inner ? innerloop_iters : 1;
1146
1147       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1148            gsi_next (&gsi))
1149         {
1150           gimple *stmt = gsi_stmt (gsi);
1151           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1152
1153           /* Only assignments and calls are costed.  */
1154           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1155             continue;
1156
1157           /* Skip stmts that are not vectorized inside the loop.  */
1158           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1159           const bool live_cycle_def
1160             = (STMT_VINFO_LIVE_P (vstmt_info)
1161                && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (vstmt_info)));
1162           if (!STMT_VINFO_RELEVANT_P (vstmt_info) && !live_cycle_def)
1163             continue;
1164
1165           /* Memory accesses are costed as loads or stores.  */
1166           vect_cost_for_stmt kind;
1167           if (!STMT_VINFO_DATA_REF (stmt_info))
1168             kind = scalar_stmt;
1169           else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1170             kind = scalar_load;
1171           else
1172             kind = scalar_store;
1173
1174           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1175                             factor, kind, stmt_info, 0, vect_prologue);
1176         }
1177     }
1178
1179   /* Now accumulate cost.  */
1180   void *target_cost_data = init_cost (loop);
1181   stmt_info_for_cost *entry;
1182   int entry_idx;
1183   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1184                     entry_idx, entry)
1185     (void) add_stmt_cost (target_cost_data, entry->count,
1186                           entry->kind, entry->stmt_info, entry->misalign,
1187                           vect_body);
1188
1189   /* Only the body cost is wanted; the other two results are discarded.  */
1190   unsigned discarded, body_cost = 0;
1191   finish_cost (target_cost_data, &discarded, &body_cost, &discarded);
1192   destroy_cost_data (target_cost_data);
1193   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1194 }
1195
1196
1197 /* Function vect_analyze_loop_form_1.
1198
1199    Verify that certain CFG restrictions hold for LOOP, including:
1200    - the loop has a pre-header, a single entry and a single exit
1201    - the loop exit condition is simple enough
1202    - the number of iterations can be analyzed, i.e., a countable loop.  The
1203      niter could be analyzed under some assumptions.
1204    The exit condition(s), assumptions and niters are returned via pointers.  */
1205
1206 opt_result
1207 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1208                           tree *assumptions, tree *number_of_iterationsm1,
1209                           tree *number_of_iterations, gcond **inner_loop_cond)
1210 {
1211   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1212
1213   /* Different restrictions apply when we are considering an inner-most loop,
1214      vs. an outer (nested) loop.
1215      (FORNOW. May want to relax some of these restrictions in the future).  */
1216
1217   if (!loop->inner)
1218     {
1219       /* Inner-most loop.  We currently require that the number of BBs is
1220          exactly 2 (the header and latch).  Vectorizable inner-most loops
1221          look like this:
1222
1223                         (pre-header)
1224                            |
1225                           header <--------+
1226                            | |            |
1227                            | +--> latch --+
1228                            |
1229                         (exit-bb)  */
1230
1231       if (loop->num_nodes != 2)
1232         return opt_result::failure_at (vect_location,
1233                                        "not vectorized:"
1234                                        " control flow in loop.\n");
1235
1236       if (empty_block_p (loop->header))
1237         return opt_result::failure_at (vect_location,
1238                                        "not vectorized: empty loop.\n");
1239     }
1240   else
1241     {
1242       struct loop *innerloop = loop->inner;
1243       edge entryedge;
1244
1245       /* Nested loop. We currently require that the loop is doubly-nested,
1246          contains a single inner loop, and the number of BBs is exactly 5.
1247          Vectorizable outer-loops look like this:
1248
1249                         (pre-header)
1250                            |
1251                           header <---+
1252                            |         |
1253                           inner-loop |
1254                            |         |
1255                           tail ------+
1256                            |
1257                         (exit-bb)
1258
1259          The inner-loop has the properties expected of inner-most loops
1260          as described above.  */
1261
1262       if ((loop->inner)->inner || (loop->inner)->next)
1263         return opt_result::failure_at (vect_location,
1264                                        "not vectorized:"
1265                                        " multiple nested loops.\n");
1266
1267       if (loop->num_nodes != 5)
1268         return opt_result::failure_at (vect_location,
1269                                        "not vectorized:"
1270                                        " control flow in loop.\n");
1271       /* The inner loop must hang directly between the header and the tail.  */
1272       entryedge = loop_preheader_edge (innerloop);
1273       if (entryedge->src != loop->header
1274           || !single_exit (innerloop)
1275           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1276         return opt_result::failure_at (vect_location,
1277                                        "not vectorized:"
1278                                        " unsupported outerloop form.\n");
1279
1280       /* Analyze the inner-loop.  */
1281       tree inner_niterm1, inner_niter, inner_assumptions;
1282       opt_result res
1283         = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1284                                     &inner_assumptions, &inner_niterm1,
1285                                     &inner_niter, NULL); /* No deeper nest.  */
1286       if (!res)
1287         {
1288           if (dump_enabled_p ())
1289             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290                              "not vectorized: Bad inner loop.\n");
1291           return res;
1292         }
1293
1294       /* Don't support analyzing niter under assumptions for inner
1295          loop.  */
1296       if (!integer_onep (inner_assumptions))
1297         return opt_result::failure_at (vect_location,
1298                                        "not vectorized: Bad inner loop.\n");
1299
1300       if (!expr_invariant_in_loop_p (loop, inner_niter))
1301         return opt_result::failure_at (vect_location,
1302                                        "not vectorized: inner-loop count not"
1303                                        " invariant.\n");
1304
1305       if (dump_enabled_p ())
1306         dump_printf_loc (MSG_NOTE, vect_location,
1307                          "Considering outer-loop vectorization.\n");
1308     }
1309   /* The remaining checks apply to inner-most and outer loops alike.  */
1310   if (!single_exit (loop))
1311     return opt_result::failure_at (vect_location,
1312                                    "not vectorized: multiple exits.\n");
1313   if (EDGE_COUNT (loop->header->preds) != 2)
1314     return opt_result::failure_at (vect_location,
1315                                    "not vectorized:"
1316                                    " too many incoming edges.\n");
1317
1318   /* We assume that the loop exit condition is at the end of the loop. i.e,
1319      that the loop is represented as a do-while (with a proper if-guard
1320      before the loop if needed), where the loop header contains all the
1321      executable statements, and the latch is empty.  */
1322   if (!empty_block_p (loop->latch)
1323       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1324     return opt_result::failure_at (vect_location,
1325                                    "not vectorized: latch block not empty.\n");
1326
1327   /* Make sure the exit is not abnormal.  */
1328   edge e = single_exit (loop);
1329   if (e->flags & EDGE_ABNORMAL)
1330     return opt_result::failure_at (vect_location,
1331                                    "not vectorized:"
1332                                    " abnormal loop exit edge.\n");
1333   /* Note the argument order: NUMBER_OF_ITERATIONS precedes the M1 form.  */
1334   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1335                                      number_of_iterationsm1);
1336   if (!*loop_cond)
1337     return opt_result::failure_at
1338       (vect_location,
1339        "not vectorized: complicated exit condition.\n");
1340   /* Give up if the assumptions are known false or niter is unknown.  */
1341   if (integer_zerop (*assumptions)
1342       || !*number_of_iterations
1343       || chrec_contains_undetermined (*number_of_iterations))
1344     return opt_result::failure_at
1345       (*loop_cond,
1346        "not vectorized: number of iterations cannot be computed.\n");
1347
1348   if (integer_zerop (*number_of_iterations))
1349     return opt_result::failure_at
1350       (*loop_cond,
1351        "not vectorized: number of iterations = 0.\n");
1352
1353   return opt_result::success ();
1354 }
1355
1356 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form,
1357    otherwise propagate the failure from vect_analyze_loop_form_1.  */
1358
1359 opt_loop_vec_info
1360 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1361 {
1362   gcond *exit_cond;
1363   gcond *inner_exit_cond = NULL;
1364   tree niter_assumptions, niters, nitersm1;
1365
1366   opt_result form_ok
1367     = vect_analyze_loop_form_1 (loop, &exit_cond, &niter_assumptions,
1368                                 &nitersm1, &niters, &inner_exit_cond);
1369   if (!form_ok)
1370     return opt_loop_vec_info::propagate_failure (form_ok);
1371
1372   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1373   LOOP_VINFO_NITERSM1 (loop_vinfo) = nitersm1;
1374   LOOP_VINFO_NITERS (loop_vinfo) = niters;
1375   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
1376
1377   if (!integer_onep (niter_assumptions))
1378     {
1379       /* The loop may only be vectorized by versioning it under some
1380          assumptions.  This requires clearing the information already
1381          computed by scev and the niter analyzer.  */
1382       scev_reset_htab ();
1383       free_numbers_of_iterations_estimates (loop);
1384       /* Flag the loop so that later scev and niter analysis is done
1385          under the assumptions.  */
1386       loop_constraint_set (loop, LOOP_C_FINITE);
1387       /* Remember the assumptions for versioning.  */
1388       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = niter_assumptions;
1389     }
1390
1391   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1392     {
1393       dump_printf_loc (MSG_NOTE, vect_location,
1394                        "Symbolic number of iterations is ");
1395       dump_generic_expr (MSG_NOTE, TDF_DETAILS, niters);
1396       dump_printf (MSG_NOTE, "\n");
1397     }
1398
1399   /* Give the exit condition(s) the loop_exit_ctrl stmt type.  */
1400   stmt_vec_info exit_cond_info = loop_vinfo->lookup_stmt (exit_cond);
1401   STMT_VINFO_TYPE (exit_cond_info) = loop_exit_ctrl_vec_info_type;
1402   if (inner_exit_cond)
1403     {
1404       stmt_vec_info inner_info = loop_vinfo->lookup_stmt (inner_exit_cond);
1405       STMT_VINFO_TYPE (inner_info) = loop_exit_ctrl_vec_info_type;
1406     }
1407
1408   /* Attach the new info to LOOP, which must not have one already.  */
1409   gcc_assert (!loop->aux);
1410   loop->aux = loop_vinfo;
1411   return opt_loop_vec_info::success (loop_vinfo);
1412 }
1413
1414
1415
1416 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1417    statements, update the vectorization factor of LOOP_VINFO.  */
1418
1419 static void
1420 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1421 {
1422   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1423   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1424   const int nbbs = loop->num_nodes;
1425
1426   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1427
1428   poly_uint64 new_vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1429   gcc_assert (known_ne (new_vf, 0U));
1430
1431   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1432      vectorization factor of the loop is the unrolling factor required by
1433      the SLP instances.  If that unrolling factor is 1, we say, that we
1434      perform pure SLP on loop - cross iteration parallelism is not
1435      exploited.  */
1436   bool needs_loop_vect = false;
1437   for (int bb_idx = 0; bb_idx < nbbs; bb_idx++)
1438     {
1439       basic_block bb = bbs[bb_idx];
1440       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1441            gsi_next (&gsi))
1442         {
1443           stmt_vec_info stmt_info
1444             = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (gsi_stmt (gsi)));
1445           if (PURE_SLP_STMT (stmt_info))
1446             continue;
1447
1448           /* STMT needs both SLP and loop-based vectorization.  */
1449           if (STMT_VINFO_RELEVANT_P (stmt_info)
1450               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1451             needs_loop_vect = true;
1452         }
1453     }
1454
1455   if (needs_loop_vect)
1456     {
1457       if (dump_enabled_p ())
1458         dump_printf_loc (MSG_NOTE, vect_location,
1459                          "Loop contains SLP and non-SLP stmts\n");
1460       /* Both the vectorization factor and unroll factor have the form
1461          current_vector_size * X for some rational X, so they must have
1462          a common multiple.  */
1463       new_vf
1464         = force_common_multiple (new_vf,
1465                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1466     }
1467   else
1468     {
1469       if (dump_enabled_p ())
1470         dump_printf_loc (MSG_NOTE, vect_location,
1471                          "Loop contains only SLP stmts\n");
1472       new_vf = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1473     }
1474
1475   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = new_vf;
1476   if (dump_enabled_p ())
1477     {
1478       dump_printf_loc (MSG_NOTE, vect_location,
1479                        "Updating vectorization factor to ");
1480       dump_dec (MSG_NOTE, new_vf);
1481       dump_printf (MSG_NOTE, ".\n");
1482     }
1483 }
1484
1485 /* Return true if STMT_INFO describes a double reduction phi whose
1486    partner phi in the reduction is itself relevant for vectorization.
1487    The second condition rejects cases such as:
1488
1489       outer1:
1490         x_1 = PHI <x_3(outer2), ...>;
1491         ...
1492
1493       inner:
1494         x_2 = ...;
1495         ...
1496
1497       outer2:
1498         x_3 = PHI <x_2(inner)>;
1499
1500    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1501
1502 static bool
1503 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1504 {
1505   bool is_double_reduc
1506     = STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def;
1507   return (is_double_reduc
1508           && STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info)));
1509 }
1510
1511 /* Function vect_analyze_loop_operations.
1512
1513    Scan the loop stmts and make sure they are all vectorizable.  */
1514
1515 static opt_result
1516 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1517 {
1518   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1519   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1520   int nbbs = loop->num_nodes;
1521   int i;
1522   stmt_vec_info stmt_info;
1523   bool need_to_vectorize = false;
1524   bool ok;
1525
1526   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1527
1528   auto_vec<stmt_info_for_cost> cost_vec;
1529
1530   for (i = 0; i < nbbs; i++)
1531     {
1532       basic_block bb = bbs[i];
1533
1534       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535            gsi_next (&si))
1536         {
1537           gphi *phi = si.phi ();
1538           ok = true;
1539
1540           stmt_info = loop_vinfo->lookup_stmt (phi);
1541           if (dump_enabled_p ())
1542             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1543           if (virtual_operand_p (gimple_phi_result (phi)))
1544             continue;
1545
1546           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1547              (i.e., a phi in the tail of the outer-loop).  */
1548           if (! is_loop_header_bb_p (bb))
1549             {
1550               /* FORNOW: we currently don't support the case that these phis
1551                  are not used in the outerloop (unless it is double reduction,
1552                  i.e., this phi is vect_reduction_def), cause this case
1553                  requires to actually do something here.  */
1554               if (STMT_VINFO_LIVE_P (stmt_info)
1555                   && !vect_active_double_reduction_p (stmt_info))
1556                 return opt_result::failure_at (phi,
1557                                                "Unsupported loop-closed phi"
1558                                                " in outer-loop.\n");
1559
1560               /* If PHI is used in the outer loop, we check that its operand
1561                  is defined in the inner loop.  */
1562               if (STMT_VINFO_RELEVANT_P (stmt_info))
1563                 {
1564                   tree phi_op;
1565
1566                   if (gimple_phi_num_args (phi) != 1)
1567                     return opt_result::failure_at (phi, "unsupported phi");
1568
1569                   phi_op = PHI_ARG_DEF (phi, 0);
1570                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1571                   if (!op_def_info)
1572                     return opt_result::failure_at (phi, "unsupported phi");
1573
1574                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1575                       && (STMT_VINFO_RELEVANT (op_def_info)
1576                           != vect_used_in_outer_by_reduction))
1577                     return opt_result::failure_at (phi, "unsupported phi");
1578                 }
1579
1580               continue;
1581             }
1582
1583           gcc_assert (stmt_info);
1584
1585           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1586                || STMT_VINFO_LIVE_P (stmt_info))
1587               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1588             /* A scalar-dependence cycle that we don't support.  */
1589             return opt_result::failure_at (phi,
1590                                            "not vectorized:"
1591                                            " scalar dependence cycle.\n");
1592
1593           if (STMT_VINFO_RELEVANT_P (stmt_info))
1594             {
1595               need_to_vectorize = true;
1596               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1597                   && ! PURE_SLP_STMT (stmt_info))
1598                 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1599                                              &cost_vec);
1600               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1601                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1602                        && ! PURE_SLP_STMT (stmt_info))
1603                 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1604                                              &cost_vec);
1605             }
1606
1607           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1608           if (ok
1609               && STMT_VINFO_LIVE_P (stmt_info)
1610               && !PURE_SLP_STMT (stmt_info))
1611             ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1612                                               &cost_vec);
1613
1614           if (!ok)
1615             return opt_result::failure_at (phi,
1616                                            "not vectorized: relevant phi not "
1617                                            "supported: %G",
1618                                            static_cast <gimple *> (phi));
1619         }
1620
1621       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1622            gsi_next (&si))
1623         {
1624           gimple *stmt = gsi_stmt (si);
1625           if (!gimple_clobber_p (stmt))
1626             {
1627               opt_result res
1628                 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1629                                      &need_to_vectorize,
1630                                      NULL, NULL, &cost_vec);
1631               if (!res)
1632                 return res;
1633             }
1634         }
1635     } /* bbs */
1636
1637   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1638
1639   /* All operations in the loop are either irrelevant (deal with loop
1640      control, or dead), or only used outside the loop and can be moved
1641      out of the loop (e.g. invariants, inductions).  The loop can be
1642      optimized away by scalar optimizations.  We're better off not
1643      touching this loop.  */
1644   if (!need_to_vectorize)
1645     {
1646       if (dump_enabled_p ())
1647         dump_printf_loc (MSG_NOTE, vect_location,
1648                          "All the computation can be taken out of the loop.\n");
1649       return opt_result::failure_at
1650         (vect_location,
1651          "not vectorized: redundant loop. no profit to vectorize.\n");
1652     }
1653
1654   return opt_result::success ();
1655 }
1656
1657 /* Decide from the cost model whether vectorizing the loop described by
1658    LOOP_VINFO is worthwhile.  The result is 1 if it definitely is, 0 if
1659    it definitely is not, and -1 if it is worth retrying.  */
1660
1661 static int
1662 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1663 {
1664   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1665   unsigned int cost_vf = vect_vf_for_cost (loop_vinfo);
1666
1667   /* Unless the loop is fully masked, its iteration count must not be
1668      smaller than the vectorization factor.  */
1669   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1670     {
1671       HOST_WIDE_INT niter_bound;
1672       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1673         niter_bound = LOOP_VINFO_INT_NITERS (loop_vinfo);
1674       else
1675         niter_bound = max_stmt_executions_int (loop);
1676
1677       bool have_bound = niter_bound != -1;
1678       if (have_bound && (unsigned HOST_WIDE_INT) niter_bound < cost_vf)
1679         {
1680           if (dump_enabled_p ())
1681             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1682                              "not vectorized: iteration count smaller than "
1683                              "vectorization factor.\n");
1684           return 0;
1685         }
1686     }
1687
1688   /* Query the cost model.  A negative iteration minimum is reported as
1689      never profitable and makes this attempt fail with -1.  */
1690   int min_iters, min_estimate;
1691   vect_estimate_min_profitable_iters (loop_vinfo, &min_iters, &min_estimate);
1692   if (min_iters < 0)
1693     {
1694       if (dump_enabled_p ())
1695         {
1696           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697                            "not vectorized: vectorization not profitable.\n");
1698           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699                            "not vectorized: vector version will never be "
1700                            "profitable.\n");
1701         }
1702       return -1;
1703     }
1704
1705   /* The threshold is the more conservative of the user-specified bound
1706      (scaled by the vectorization factor) and the cost model's minimum.  */
1707   int user_bound = PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) * cost_vf;
1708   unsigned int threshold = (unsigned) MAX (user_bound, min_iters);
1709   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = threshold;
1710
1711   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1712       && LOOP_VINFO_INT_NITERS (loop_vinfo) < threshold)
1713     {
1714       if (dump_enabled_p ())
1715         {
1716           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717                            "not vectorized: vectorization not profitable.\n");
1718           dump_printf_loc (MSG_NOTE, vect_location,
1719                            "not vectorized: iteration count smaller than user "
1720                            "specified loop bound parameter or minimum profitable "
1721                            "iterations (whichever is more conservative).\n");
1722         }
1723       return 0;
1724     }
1725
1726   /* Use the estimated iteration count, or failing that the likely
1727      maximum, for the final check against threshold and estimate.  */
1728   HOST_WIDE_INT expected_niter = estimated_stmt_executions_int (loop);
1729   if (expected_niter == -1)
1730     expected_niter = likely_max_stmt_executions_int (loop);
1731   unsigned int estimate_floor = MAX (threshold, (unsigned) min_estimate);
1732   if (expected_niter != -1
1733       && (unsigned HOST_WIDE_INT) expected_niter < estimate_floor)
1734     {
1735       if (dump_enabled_p ())
1736         {
1737           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1738                            "not vectorized: estimated iteration count too "
1739                            "small.\n");
1740           dump_printf_loc (MSG_NOTE, vect_location,
1741                            "not vectorized: estimated iteration count smaller "
1742                            "than specified loop bound parameter or minimum "
1743                            "profitable iterations (whichever is more "
1744                            "conservative).\n");
1745         }
1746       return -1;
1747     }
1748
1749   return 1;
1750 }
1751
1752 /* Collect in DATAREFS the data references of the non-debug stmts in the
1753    blocks BBS of LOOP and count those stmts in *N_STMTS.  Fail on a stmt
1754    that cannot be analyzed or when DATAREFS exceeds the param limit.  */
1755 static opt_result
1756 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1757                            vec<data_reference_p> *datarefs,
1758                            unsigned int *n_stmts)
1759 {
1760   *n_stmts = 0;
1761   for (unsigned bb_idx = 0; bb_idx < loop->num_nodes; bb_idx++)
1762     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[bb_idx]);
1763          !gsi_end_p (gsi); gsi_next (&gsi))
1764       {
1765         gimple *stmt = gsi_stmt (gsi);
1766         if (is_gimple_debug (stmt))
1767           continue;
1768         *n_stmts += 1;
1769         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1770         if (!res)
1771           {
1772             /* With a safelen, a call to a function that has simd clones is
1773                skipped if the call stmt itself contains no data references.  */
1774             tree fndecl = NULL_TREE;
1775             if (is_gimple_call (stmt) && loop->safelen)
1776               fndecl = gimple_call_fndecl (stmt);
1777             cgraph_node *node = fndecl ? cgraph_node::get (fndecl) : NULL;
1778             if (node != NULL && node->simd_clones != NULL)
1779               {
1780                 unsigned int nargs = gimple_call_num_args (stmt);
1781                 unsigned int argno = 0;
1782                 while (argno < nargs)
1783                   {
1784                     tree arg = gimple_call_arg (stmt, argno);
1785                     if (DECL_P (arg)
1786                         || (REFERENCE_CLASS_P (arg)
1787                             && get_base_address (arg)))
1788                       break;
1789                     argno++;
1790                   }
1791                 tree lhs = gimple_call_lhs (stmt);
1792                 if (argno == nargs
1793                     && !(lhs
1794                          && (DECL_P (lhs)
1795                              || (REFERENCE_CLASS_P (lhs)
1796                                  && get_base_address (lhs)))))
1797                   continue;
1798               }
1799             return res;
1800           }
1801         /* Stop and fail once there are more datarefs than the
1802            loop-max-datarefs-for-datadeps param allows.  */
1803         if (datarefs->length ()
1804             > (unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1805           return opt_result::failure_at (stmt, "exceeded param "
1806                                          "loop-max-datarefs-for-datadeps\n");
1807       }
1808   return opt_result::success ();
1809 }
1810
1811 /* Walk the data references shared by LOOP_VINFO and find access groups
1812    that are marked as usable by SLP only but whose stmt has no SLP type.
1813    Each member of such a group is turned into a group of its own.  */
1814 static void
1815 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1816 {
1817   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1818
1819   vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1820   unsigned int dr_idx;
1821   struct data_reference *dr;
1822
1823   FOR_EACH_VEC_ELT (datarefs, dr_idx, dr)
1824     {
1825       gcc_assert (DR_REF (dr));
1826       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1827
1828       /* Only members of an interleaving chain are of interest.  */
1829       if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1830         continue;
1831
1832       stmt_vec_info leader = DR_GROUP_FIRST_ELEMENT (stmt_info);
1833       unsigned int group_size = DR_GROUP_SIZE (leader);
1834
1835       /* Skip stmts with an SLP type and groups not marked SLP-only.  */
1836       if (STMT_SLP_TYPE (stmt_info) || !STMT_VINFO_SLP_VECT_ONLY (leader))
1837         continue;
1838
1839       /* Dissolve the group: every member becomes its own first element
1840          with size 1 and a gap of the former group size minus one.  */
1841       STMT_VINFO_SLP_VECT_ONLY (leader) = false;
1842
1843       stmt_vec_info next = NULL;
1844       for (stmt_vec_info member = leader; member; member = next)
1845         {
1846           next = DR_GROUP_NEXT_ELEMENT (member);
1847           DR_GROUP_FIRST_ELEMENT (member) = member;
1848           DR_GROUP_NEXT_ELEMENT (member) = NULL;
1849           DR_GROUP_SIZE (member) = 1;
1850           DR_GROUP_GAP (member) = group_size - 1;
1851         }
1852     }
1853 }
1854
1855 /* Function vect_analyze_loop_2.
1856
1857    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1858    different analyses will record information in the loop_vec_info
1859    struct.

        FATAL is set to true on entry and reset to false once
        vect_mark_stmts_to_be_vectorized has succeeded; its address is also
        handed to vect_analyze_data_refs and vect_mark_stmts_to_be_vectorized.
        *N_STMTS is filled in by vect_get_datarefs_in_loop when the data
        references have not been gathered yet, and its value is passed on to
        vect_analyze_slp.

        Returns a successful opt_result if the loop is OK to vectorize and a
        failure otherwise.  Some failures jump to the label "again", which
        rolls the recorded state back and restarts the analysis from
        "start_over" with SLP disabled, unless no SLP was used or the checks
        there show that retrying will fail anyway.  */
1860 static opt_result
1861 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1862 {
1863   opt_result ok = opt_result::success ();
1864   int res;
1865   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1866   poly_uint64 min_vf = 2;
1867
1868   /* The first group of checks is independent of the vector size.
          FATAL is preset to true for them.  */
1869   fatal = true;
1870
1871   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1872       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1873     return opt_result::failure_at (vect_location,
1874                                    "not vectorized: simd if(0)\n");
1875
1876   /* Find all data references in the loop (which correspond to vdefs/vuses)
1877      and analyze their evolution in the loop.  */
1878
1879   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1880
1881   /* Gather the data references and count stmts in the loop.  This is only
          done when LOOP_VINFO has no data references recorded yet.  */
1882   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1883     {
1884       opt_result res
1885         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1886                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
1887                                      n_stmts);
1888       if (!res)
1889         {
1890           if (dump_enabled_p ())
1891             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892                              "not vectorized: loop contains function "
1893                              "calls or data references that cannot "
1894                              "be analyzed\n");
1895           return res;
1896         }
1897       loop_vinfo->shared->save_datarefs ();
1898     }
1899   else
          /* NOTE(review): presumably verifies that the previously saved data
             references are still valid -- confirm against vec_info_shared.  */
1900     loop_vinfo->shared->check_datarefs ();
1901
1902   /* Analyze the data references and also adjust the minimal
1903      vectorization factor according to the loads and stores.  */
1904
1905   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1906   if (!ok)
1907     {
1908       if (dump_enabled_p ())
1909         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910                          "bad data references.\n");
1911       return ok;
1912     }
1913
1914   /* Classify all cross-iteration scalar data-flow cycles.
1915      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1916   vect_analyze_scalar_cycles (loop_vinfo);
1917
1918   vect_pattern_recog (loop_vinfo);
1919
1920   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1921
1922   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1923      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1924
1925   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1926   if (!ok)
1927     {
1928       if (dump_enabled_p ())
1929         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930                          "bad data access.\n");
1931       return ok;
1932     }
1933
1934   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1935
1936   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1937   if (!ok)
1938     {
1939       if (dump_enabled_p ())
1940         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941                          "unexpected pattern.\n");
1942       return ok;
1943     }
1944
1945   /* The rest of the analysis below depends on the vector size in some
          way, so FATAL is cleared from here on.  */
1946   fatal = false;
1947
1948   /* Analyze data dependences between the data-refs in the loop
1949      and adjust the maximum vectorization factor according to
1950      the dependences.
1951      FORNOW: fail at the first data dependence that we encounter.  */
1952
1953   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1954   if (!ok)
1955     {
1956       if (dump_enabled_p ())
1957         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958                          "bad data dependence.\n");
1959       return ok;
1960     }
        /* Fail if the dependence analysis changed MAX_VF and it may now be
           smaller than the minimal factor required by the data references.  */
1961   if (max_vf != MAX_VECTORIZATION_FACTOR
1962       && maybe_lt (max_vf, min_vf))
1963     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1964   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1965
1966   ok = vect_determine_vectorization_factor (loop_vinfo);
1967   if (!ok)
1968     {
1969       if (dump_enabled_p ())
1970         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971                          "can't determine vectorization factor.\n");
1972       return ok;
1973     }
        /* Likewise fail if the chosen vectorization factor may exceed a
           changed MAX_VF.  */
1974   if (max_vf != MAX_VECTORIZATION_FACTOR
1975       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1976     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1977
1978   /* Compute the scalar iteration cost.  */
1979   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1980
        /* Remember the vectorization factor before the SLP decision below may
           update it; it is restored when re-trying without SLP.  */
1981   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1982   unsigned th;
1983
1984   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1985   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1986   if (!ok)
1987     return ok;
1988
1989   /* If there are any SLP instances mark them as pure_slp.  */
1990   bool slp = vect_make_slp_decision (loop_vinfo);
1991   if (slp)
1992     {
1993       /* Find stmts that need to be both vectorized and SLPed.  */
1994       vect_detect_hybrid_slp (loop_vinfo);
1995
1996       /* Update the vectorization factor based on the SLP decision.  */
1997       vect_update_vf_for_slp (loop_vinfo);
1998     }
1999
        /* Saved so that the flag can be restored when re-trying without SLP.  */
2000   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2001
2002   /* We don't expect to have to roll back to anything other than an empty
2003      set of rgroups.  */
2004   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2005
2006   /* This is the point where we can re-start analysis with SLP forced off.  */
2007 start_over:
2008
2009   /* Now the vectorization factor is final.  */
2010   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2011   gcc_assert (known_ne (vectorization_factor, 0U));
2012
2013   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2014     {
2015       dump_printf_loc (MSG_NOTE, vect_location,
2016                        "vectorization_factor = ");
2017       dump_dec (MSG_NOTE, vectorization_factor);
2018       dump_printf (MSG_NOTE, ", niters = %wd\n",
2019                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2020     }
2021
        /* Used further below to decide whether an epilogue is needed when the
           loop requires versioning.  */
2022   HOST_WIDE_INT max_niter
2023     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2024
2025   /* Analyze the alignment of the data-refs in the loop.
2026      Fail if a data reference is found that cannot be vectorized.  */
2027
2028   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2029   if (!ok)
2030     {
2031       if (dump_enabled_p ())
2032         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033                          "bad data alignment.\n");
2034       return ok;
2035     }
2036
2037   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2038      It is important to call pruning after vect_analyze_data_ref_accesses,
2039      since we use grouping information gathered by interleaving analysis.  */
2040   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2041   if (!ok)
2042     return ok;
2043
2044   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2045      vectorization, since we do not want to add extra peeling or
2046      add versioning for alignment.  */
2047   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2048     /* This pass will decide on using loop versioning and/or loop peeling in
2049        order to enhance the alignment of data references in the loop.  */
2050     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2051   else
2052     ok = vect_verify_datarefs_alignment (loop_vinfo);
2053   if (!ok)
2054     return ok;
2055
2056   if (slp)
2057     {
2058       /* Analyze operations in the SLP instances.  Note this may
2059          remove unsupported SLP instances which makes the above
2060          SLP kind detection invalid.  */
2061       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2062       vect_slp_analyze_operations (loop_vinfo);
            /* A changed instance count means some instance was removed:
               re-try without SLP.  */
2063       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2064         {
2065           ok = opt_result::failure_at (vect_location,
2066                                        "unsupported SLP instances\n");
2067           goto again;
2068         }
2069     }
2070
2071   /* Dissolve SLP-only groups.  */
2072   vect_dissolve_slp_only_groups (loop_vinfo);
2073
2074   /* Scan all the remaining operations in the loop that are not subject
2075      to SLP and make sure they are vectorizable.  */
2076   ok = vect_analyze_loop_operations (loop_vinfo);
2077   if (!ok)
2078     {
2079       if (dump_enabled_p ())
2080         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081                          "bad operation or unsupported loop bound.\n");
2082       return ok;
2083     }
2084
2085   /* Decide whether to use a fully-masked loop for this vectorization
2086      factor.  */
2087   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2088     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2089        && vect_verify_full_masking (loop_vinfo));
2090   if (dump_enabled_p ())
2091     {
2092       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2093         dump_printf_loc (MSG_NOTE, vect_location,
2094                          "using a fully-masked loop.\n");
2095       else
2096         dump_printf_loc (MSG_NOTE, vect_location,
2097                          "not using a fully-masked loop.\n");
2098     }
2099
2100   /* If an epilog loop is required because of data accesses with gaps,
2101      one additional iteration needs to be peeled.  Check if there are
2102      enough iterations for vectorization.  */
2103   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2104       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2105       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2106     {
2107       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2108       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2109
2110       if (known_lt (wi::to_widest (scalar_niters), vf))
2111         return opt_result::failure_at (vect_location,
2112                                        "loop has no enough iterations to"
2113                                        " support peeling for gaps.\n");
2114     }
2115
2116   /* Check the costings of the loop make vectorizing worthwhile.  A
          negative result means it is worth retrying (here: without SLP),
          zero means definitely not worthwhile.  */
2117   res = vect_analyze_loop_costing (loop_vinfo);
2118   if (res < 0)
2119     {
2120       ok = opt_result::failure_at (vect_location,
2121                                    "Loop costings may not be worthwhile.\n");
2122       goto again;
2123     }
2124   if (!res)
2125     return opt_result::failure_at (vect_location,
2126                                    "Loop costings not worthwhile.\n");
2127
2128   /* Decide whether we need to create an epilogue loop to handle
2129      remaining scalar iterations.  */
2130   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2131
2132   unsigned HOST_WIDE_INT const_vf;
2133   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2134     /* The main loop handles all iterations.  */
2135     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2136   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2137            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2138     {
2139       /* Work out the (constant) number of iterations that need to be
2140          peeled for reasons other than niters.  */
2141       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2142       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2143         peel_niter += 1;
            /* An epilogue is needed if the iterations left after that
               peeling are not a multiple of the vectorization factor.  */
2144       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2145                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2146         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2147     }
2148   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149            /* ??? When peeling for gaps but not alignment, we could
2150               try to check whether the (variable) niters is known to be
2151               VF * N + 1.  That's something of a niche case though.  */
2152            || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2153            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2154            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2155                 < (unsigned) exact_log2 (const_vf))
2156                /* In case of versioning, check if the maximum number of
2157                   iterations is greater than th.  If they are identical,
2158                   the epilogue is unnecessary.  */
2159                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2160                    || ((unsigned HOST_WIDE_INT) max_niter
2161                        > (th / const_vf) * const_vf))))
2162     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2163
2164   /* If an epilogue loop is required make sure we can create one.  */
2165   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2166       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2167     {
2168       if (dump_enabled_p ())
2169         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2170       if (!vect_can_advance_ivs_p (loop_vinfo)
2171           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2172                                            single_exit (LOOP_VINFO_LOOP
2173                                                          (loop_vinfo))))
2174         {
2175           ok = opt_result::failure_at (vect_location,
2176                                        "not vectorized: can't create required "
2177                                        "epilog loop\n");
2178           goto again;
2179         }
2180     }
2181
2182   /* During peeling, we need to check if number of loop iterations is
2183      enough for both peeled prolog loop and vector loop.  This check
2184      can be merged along with threshold check of loop versioning, so
2185      increase threshold for this case if necessary.  */
2186   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2187     {
2188       poly_uint64 niters_th = 0;
2189
2190       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2191         {
2192           /* Niters for peeled prolog loop.  A negative peeling amount
                  is accounted for with one less than the number of elements
                  in the vector type of the unaligned data reference.  */
2193           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2194             {
2195               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2196               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2197               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2198             }
2199           else
2200             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2201         }
2202
2203       /* Niters for at least one iteration of vectorized loop.  */
2204       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2205         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2206       /* One additional iteration because of peeling for gap.  */
2207       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2208         niters_th += 1;
2209       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2210     }
2211
        /* The factor read at start_over must not have changed since.  */
2212   gcc_assert (known_eq (vectorization_factor,
2213                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2214
2215   /* Ok to vectorize!  */
2216   return opt_result::success ();
2217
2218 again:
2219   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2220   gcc_assert (!ok);
2221
2222   /* Try again with SLP forced off but if we didn't do any SLP there is
2223      no point in re-trying.  */
2224   if (!slp)
2225     return ok;
2226
2227   /* If there are reduction chains re-trying will fail anyway.  */
2228   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2229     return ok;
2230
2231   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2232      via interleaving or lane instructions.  */
2233   slp_instance instance;
2234   slp_tree node;
2235   unsigned i, j;
2236   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2237     {
2238       stmt_vec_info vinfo;
            /* Only instances whose first scalar stmt is a grouped access
               are checked.  */
2239       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2240       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2241         continue;
2242       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2243       unsigned int size = DR_GROUP_SIZE (vinfo);
2244       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2245       if (! vect_store_lanes_supported (vectype, size, false)
2246          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2247          && ! vect_grouped_store_supported (vectype, size))
2248         return opt_result::failure_at (vinfo->stmt,
2249                                        "unsupported grouped store\n");
2250       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2251         {
2252           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2253           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2254           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2255           size = DR_GROUP_SIZE (vinfo);
2256           vectype = STMT_VINFO_VECTYPE (vinfo);
2257           if (! vect_load_lanes_supported (vectype, size, false)
2258               && ! vect_grouped_load_supported (vectype, single_element_p,
2259                                                 size))
2260             return opt_result::failure_at (vinfo->stmt,
2261                                            "unsupported grouped load\n");
2262         }
2263     }
2264
2265   if (dump_enabled_p ())
2266     dump_printf_loc (MSG_NOTE, vect_location,
2267                      "re-trying with SLP disabled\n");
2268
2269   /* Roll back state appropriately.  No SLP this time.  */
2270   slp = false;
2271   /* Restore vectorization factor as it was without SLP.  */
2272   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2273   /* Free the SLP instances.  */
2274   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2275     vect_free_slp_instance (instance, false);
2276   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2277   /* Reset SLP type to loop_vect on all stmts: the phis and stmts of each
          block, and for stmts in a pattern also the related pattern stmt and
          the stmts of the pattern definition sequence.  */
2278   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2279     {
2280       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2281       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2282            !gsi_end_p (si); gsi_next (&si))
2283         {
2284           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2285           STMT_SLP_TYPE (stmt_info) = loop_vect;
2286         }
2287       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2288            !gsi_end_p (si); gsi_next (&si))
2289         {
2290           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2291           STMT_SLP_TYPE (stmt_info) = loop_vect;
2292           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2293             {
2294               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2295               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2296               STMT_SLP_TYPE (stmt_info) = loop_vect;
2297               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2298                    !gsi_end_p (pi); gsi_next (&pi))
2299                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2300                   = loop_vect;
2301             }
2302         }
2303     }
2304   /* Free optimized alias test DDRS.  */
2305   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2306   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2307   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2308   /* Reset target cost data.  */
2309   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2310   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2311     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2312   /* Reset accumulated rgroup information.  */
2313   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2314   /* Reset assorted flags.  */
2315   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2316   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2317   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2318   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2319   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2320
2321   goto start_over;
2322 }
2323
2324 /* Function vect_analyze_loop.
2325
2326    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2327    for it.  The different analyses will record information in the
2328    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2329    be vectorized.  */
2330 opt_loop_vec_info
2331 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2332                    vec_info_shared *shared)
2333 {
2334   auto_vector_sizes vector_sizes;
2335
2336   /* Autodetect first vector size we try.  */
2337   current_vector_size = 0;
2338   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2339                                                 loop->simdlen != 0);
2340   unsigned int next_size = 0;
2341
2342   DUMP_VECT_SCOPE ("analyze_loop_nest");
2343
2344   if (loop_outer (loop)
2345       && loop_vec_info_for_loop (loop_outer (loop))
2346       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2347     return opt_loop_vec_info::failure_at (vect_location,
2348                                           "outer-loop already vectorized.\n");
2349
2350   if (!find_loop_nest (loop, &shared->loop_nest))
2351     return opt_loop_vec_info::failure_at
2352       (vect_location,
2353        "not vectorized: loop nest containing two or more consecutive inner"
2354        " loops cannot be vectorized\n");
2355
2356   unsigned n_stmts = 0;
2357   poly_uint64 autodetected_vector_size = 0;
2358   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2359   poly_uint64 first_vector_size = 0;
2360   while (1)
2361     {
2362       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2363       opt_loop_vec_info loop_vinfo
2364         = vect_analyze_loop_form (loop, shared);
2365       if (!loop_vinfo)
2366         {
2367           if (dump_enabled_p ())
2368             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369                              "bad loop form.\n");
2370           gcc_checking_assert (first_loop_vinfo == NULL);
2371           return loop_vinfo;
2372         }
2373
2374       bool fatal = false;
2375
2376       if (orig_loop_vinfo)
2377         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2378
2379       opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2380       if (res)
2381         {
2382           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2383
2384           if (loop->simdlen
2385               && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2386                            (unsigned HOST_WIDE_INT) loop->simdlen))
2387             {
2388               if (first_loop_vinfo == NULL)
2389                 {
2390                   first_loop_vinfo = loop_vinfo;
2391                   first_vector_size = current_vector_size;
2392                   loop->aux = NULL;
2393                 }
2394               else
2395                 delete loop_vinfo;
2396             }
2397           else
2398             {
2399               delete first_loop_vinfo;
2400               return loop_vinfo;
2401             }
2402         }
2403       else
2404         delete loop_vinfo;
2405
2406       if (next_size == 0)
2407         autodetected_vector_size = current_vector_size;
2408
2409       if (next_size < vector_sizes.length ()
2410           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2411         next_size += 1;
2412
2413       if (fatal)
2414         {
2415           gcc_checking_assert (first_loop_vinfo == NULL);
2416           return opt_loop_vec_info::propagate_failure (res);
2417         }
2418
2419       if (next_size == vector_sizes.length ()
2420           || known_eq (current_vector_size, 0U))
2421         {
2422           if (first_loop_vinfo)
2423             {
2424               current_vector_size = first_vector_size;
2425               loop->aux = (loop_vec_info) first_loop_vinfo;
2426               if (dump_enabled_p ())
2427                 {
2428                   dump_printf_loc (MSG_NOTE, vect_location,
2429                                    "***** Choosing vector size ");
2430                   dump_dec (MSG_NOTE, current_vector_size);
2431                   dump_printf (MSG_NOTE, "\n");
2432                 }
2433               return first_loop_vinfo;
2434             }
2435           else
2436             return opt_loop_vec_info::propagate_failure (res);
2437         }
2438
2439       /* Try the next biggest vector size.  */
2440       current_vector_size = vector_sizes[next_size++];
2441       if (dump_enabled_p ())
2442         {
2443           dump_printf_loc (MSG_NOTE, vect_location,
2444                            "***** Re-trying analysis with "
2445                            "vector size ");
2446           dump_dec (MSG_NOTE, current_vector_size);
2447           dump_printf (MSG_NOTE, "\n");
2448         }
2449     }
2450 }
2451
2452 /* Return true if there is an in-order reduction function for CODE, storing
2453    it in *REDUC_FN if so.  */
2454
2455 static bool
2456 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2457 {
2458   switch (code)
2459     {
2460     case PLUS_EXPR:
2461       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2462       return true;
2463
2464     default:
2465       return false;
2466     }
2467 }
2468
2469 /* Function reduction_fn_for_scalar_code
2470
2471    Input:
2472    CODE - tree_code of a reduction operations.
2473
2474    Output:
2475    REDUC_FN - the corresponding internal function to be used to reduce the
2476       vector of partial results into a single scalar result, or IFN_LAST
2477       if the operation is a supported reduction operation, but does not have
2478       such an internal function.
2479
2480    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2481
2482 static bool
2483 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2484 {
2485   switch (code)
2486     {
2487       case MAX_EXPR:
2488         *reduc_fn = IFN_REDUC_MAX;
2489         return true;
2490
2491       case MIN_EXPR:
2492         *reduc_fn = IFN_REDUC_MIN;
2493         return true;
2494
2495       case PLUS_EXPR:
2496         *reduc_fn = IFN_REDUC_PLUS;
2497         return true;
2498
2499       case BIT_AND_EXPR:
2500         *reduc_fn = IFN_REDUC_AND;
2501         return true;
2502
2503       case BIT_IOR_EXPR:
2504         *reduc_fn = IFN_REDUC_IOR;
2505         return true;
2506
2507       case BIT_XOR_EXPR:
2508         *reduc_fn = IFN_REDUC_XOR;
2509         return true;
2510
2511       case MULT_EXPR:
2512       case MINUS_EXPR:
2513         *reduc_fn = IFN_LAST;
2514         return true;
2515
2516       default:
2517        return false;
2518     }
2519 }
2520
2521 /* If there is a neutral value X such that SLP reduction NODE would not
2522    be affected by the introduction of additional X elements, return that X,
2523    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2524    is true if the SLP statements perform a single reduction, false if each
2525    statement performs an independent reduction.  */
2526
2527 static tree
2528 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2529                               bool reduc_chain)
2530 {
2531   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2532   stmt_vec_info stmt_vinfo = stmts[0];
2533   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2534   tree scalar_type = TREE_TYPE (vector_type);
2535   struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2536   gcc_assert (loop);
2537
2538   switch (code)
2539     {
2540     case WIDEN_SUM_EXPR:
2541     case DOT_PROD_EXPR:
2542     case SAD_EXPR:
2543     case PLUS_EXPR:
2544     case MINUS_EXPR:
2545     case BIT_IOR_EXPR:
2546     case BIT_XOR_EXPR:
2547       return build_zero_cst (scalar_type);
2548
2549     case MULT_EXPR:
2550       return build_one_cst (scalar_type);
2551
2552     case BIT_AND_EXPR:
2553       return build_all_ones_cst (scalar_type);
2554
2555     case MAX_EXPR:
2556     case MIN_EXPR:
2557       /* For MIN/MAX the initial values are neutral.  A reduction chain
2558          has only a single initial value, so that value is neutral for
2559          all statements.  */
2560       if (reduc_chain)
2561         return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2562                                       loop_preheader_edge (loop));
2563       return NULL_TREE;
2564
2565     default:
2566       return NULL_TREE;
2567     }
2568 }
2569
2570 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2571    STMT is printed with a message MSG. */
2572
2573 static void
2574 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2575 {
2576   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2577 }
2578
2579 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2580    operation.  Return true if the results of DEF_STMT_INFO are something
2581    that can be accumulated by such a reduction.  */
2582
2583 static bool
2584 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2585 {
2586   return (is_gimple_assign (def_stmt_info->stmt)
2587           || is_gimple_call (def_stmt_info->stmt)
2588           || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2589           || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2590               && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2591               && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2592 }
2593
2594 /* Detect SLP reduction of the form:
2595
2596    #a1 = phi <a5, a0>
2597    a2 = operation (a1)
2598    a3 = operation (a2)
2599    a4 = operation (a3)
2600    a5 = operation (a4)
2601
2602    #a = phi <a5>
2603
2604    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2605    FIRST_STMT is the first reduction stmt in the chain
2606    (a2 = operation (a1)).
2607
2608    Return TRUE if a reduction chain was detected.  */
2609
2610 static bool
2611 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2612                        gimple *first_stmt)
2613 {
2614   struct loop *loop = (gimple_bb (phi))->loop_father;
2615   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2616   enum tree_code code;
2617   gimple *loop_use_stmt = NULL;
2618   stmt_vec_info use_stmt_info;
2619   tree lhs;
2620   imm_use_iterator imm_iter;
2621   use_operand_p use_p;
2622   int nloop_uses, size = 0, n_out_of_loop_uses;
2623   bool found = false;
2624
2625   if (loop != vect_loop)
2626     return false;
2627
2628   auto_vec<stmt_vec_info, 8> reduc_chain;
2629   lhs = PHI_RESULT (phi);
2630   code = gimple_assign_rhs_code (first_stmt);
2631   while (1)
2632     {
2633       nloop_uses = 0;
2634       n_out_of_loop_uses = 0;
2635       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2636         {
2637           gimple *use_stmt = USE_STMT (use_p);
2638           if (is_gimple_debug (use_stmt))
2639             continue;
2640
2641           /* Check if we got back to the reduction phi.  */
2642           if (use_stmt == phi)
2643             {
2644               loop_use_stmt = use_stmt;
2645               found = true;
2646               break;
2647             }
2648
2649           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2650             {
2651               loop_use_stmt = use_stmt;
2652               nloop_uses++;
2653             }
2654            else
2655              n_out_of_loop_uses++;
2656
2657            /* There are can be either a single use in the loop or two uses in
2658               phi nodes.  */
2659            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2660              return false;
2661         }
2662
2663       if (found)
2664         break;
2665
2666       /* We reached a statement with no loop uses.  */
2667       if (nloop_uses == 0)
2668         return false;
2669
2670       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2671       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2672         return false;
2673
2674       if (!is_gimple_assign (loop_use_stmt)
2675           || code != gimple_assign_rhs_code (loop_use_stmt)
2676           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2677         return false;
2678
2679       /* Insert USE_STMT into reduction chain.  */
2680       use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2681       reduc_chain.safe_push (use_stmt_info);
2682
2683       lhs = gimple_assign_lhs (loop_use_stmt);
2684       size++;
2685    }
2686
2687   if (!found || loop_use_stmt != phi || size < 2)
2688     return false;
2689
2690   /* Swap the operands, if needed, to make the reduction operand be the second
2691      operand.  */
2692   lhs = PHI_RESULT (phi);
2693   for (unsigned i = 0; i < reduc_chain.length (); ++i)
2694     {
2695       gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2696       if (gimple_assign_rhs2 (next_stmt) == lhs)
2697         {
2698           tree op = gimple_assign_rhs1 (next_stmt);
2699           stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2700
2701           /* Check that the other def is either defined in the loop
2702              ("vect_internal_def"), or it's an induction (defined by a
2703              loop-header phi-node).  */
2704           if (def_stmt_info
2705               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2706               && vect_valid_reduction_input_p (def_stmt_info))
2707             {
2708               lhs = gimple_assign_lhs (next_stmt);
2709               continue;
2710             }
2711
2712           return false;
2713         }
2714       else
2715         {
2716           tree op = gimple_assign_rhs2 (next_stmt);
2717           stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2718
2719           /* Check that the other def is either defined in the loop
2720             ("vect_internal_def"), or it's an induction (defined by a
2721             loop-header phi-node).  */
2722           if (def_stmt_info
2723               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2724               && vect_valid_reduction_input_p (def_stmt_info))
2725             {
2726               if (dump_enabled_p ())
2727                 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2728                                  next_stmt);
2729
2730               swap_ssa_operands (next_stmt,
2731                                  gimple_assign_rhs1_ptr (next_stmt),
2732                                  gimple_assign_rhs2_ptr (next_stmt));
2733               update_stmt (next_stmt);
2734
2735               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2736                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2737             }
2738           else
2739             return false;
2740         }
2741
2742       lhs = gimple_assign_lhs (next_stmt);
2743     }
2744
2745   /* Build up the actual chain.  */
2746   for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2747     {
2748       REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2749       REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2750     }
2751   REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2752   REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2753
2754   /* Save the chain for further analysis in SLP detection.  */
2755   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2756   REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2757
2758   return true;
2759 }
2760
2761 /* Return true if we need an in-order reduction for operation CODE
2762    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2763    overflow must wrap.  */
2764
2765 static bool
2766 needs_fold_left_reduction_p (tree type, tree_code code,
2767                              bool need_wrapping_integral_overflow)
2768 {
2769   /* CHECKME: check for !flag_finite_math_only too?  */
2770   if (SCALAR_FLOAT_TYPE_P (type))
2771     switch (code)
2772       {
2773       case MIN_EXPR:
2774       case MAX_EXPR:
2775         return false;
2776
2777       default:
2778         return !flag_associative_math;
2779       }
2780
2781   if (INTEGRAL_TYPE_P (type))
2782     {
2783       if (!operation_no_trapping_overflow (type, code))
2784         return true;
2785       if (need_wrapping_integral_overflow
2786           && !TYPE_OVERFLOW_WRAPS (type)
2787           && operation_can_overflow (code))
2788         return true;
2789       return false;
2790     }
2791
2792   if (SAT_FIXED_POINT_TYPE_P (type))
2793     return true;
2794
2795   return false;
2796 }
2797
2798 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2799    reduction operation CODE has a handled computation expression.  */
2800
2801 bool
2802 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2803                       tree loop_arg, enum tree_code code)
2804 {
2805   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2806   auto_bitmap visited;
2807   tree lookfor = PHI_RESULT (phi);
2808   ssa_op_iter curri;
2809   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2810   while (USE_FROM_PTR (curr) != loop_arg)
2811     curr = op_iter_next_use (&curri);
2812   curri.i = curri.numops;
2813   do
2814     {
2815       path.safe_push (std::make_pair (curri, curr));
2816       tree use = USE_FROM_PTR (curr);
2817       if (use == lookfor)
2818         break;
2819       gimple *def = SSA_NAME_DEF_STMT (use);
2820       if (gimple_nop_p (def)
2821           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2822         {
2823 pop:
2824           do
2825             {
2826               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2827               curri = x.first;
2828               curr = x.second;
2829               do
2830                 curr = op_iter_next_use (&curri);
2831               /* Skip already visited or non-SSA operands (from iterating
2832                  over PHI args).  */
2833               while (curr != NULL_USE_OPERAND_P
2834                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2835                          || ! bitmap_set_bit (visited,
2836                                               SSA_NAME_VERSION
2837                                                 (USE_FROM_PTR (curr)))));
2838             }
2839           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2840           if (curr == NULL_USE_OPERAND_P)
2841             break;
2842         }
2843       else
2844         {
2845           if (gimple_code (def) == GIMPLE_PHI)
2846             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2847           else
2848             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2849           while (curr != NULL_USE_OPERAND_P
2850                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2851                      || ! bitmap_set_bit (visited,
2852                                           SSA_NAME_VERSION
2853                                             (USE_FROM_PTR (curr)))))
2854             curr = op_iter_next_use (&curri);
2855           if (curr == NULL_USE_OPERAND_P)
2856             goto pop;
2857         }
2858     }
2859   while (1);
2860   if (dump_file && (dump_flags & TDF_DETAILS))
2861     {
2862       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2863       unsigned i;
2864       std::pair<ssa_op_iter, use_operand_p> *x;
2865       FOR_EACH_VEC_ELT (path, i, x)
2866         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2867       dump_printf (MSG_NOTE, "\n");
2868     }
2869
2870   /* Check whether the reduction path detected is valid.  */
2871   bool fail = path.length () == 0;
2872   bool neg = false;
2873   for (unsigned i = 1; i < path.length (); ++i)
2874     {
2875       gimple *use_stmt = USE_STMT (path[i].second);
2876       tree op = USE_FROM_PTR (path[i].second);
2877       if (! has_single_use (op)
2878           || ! is_gimple_assign (use_stmt))
2879         {
2880           fail = true;
2881           break;
2882         }
2883       if (gimple_assign_rhs_code (use_stmt) != code)
2884         {
2885           if (code == PLUS_EXPR
2886               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2887             {
2888               /* Track whether we negate the reduction value each iteration.  */
2889               if (gimple_assign_rhs2 (use_stmt) == op)
2890                 neg = ! neg;
2891             }
2892           else
2893             {
2894               fail = true;
2895               break;
2896             }
2897         }
2898     }
2899   return ! fail && ! neg;
2900 }
2901
2902
2903 /* Function vect_is_simple_reduction
2904
2905    (1) Detect a cross-iteration def-use cycle that represents a simple
2906    reduction computation.  We look for the following pattern:
2907
2908    loop_header:
2909      a1 = phi < a0, a2 >
2910      a3 = ...
2911      a2 = operation (a3, a1)
2912
2913    or
2914
2915    a3 = ...
2916    loop_header:
2917      a1 = phi < a0, a2 >
2918      a2 = operation (a3, a1)
2919
2920    such that:
2921    1. operation is commutative and associative and it is safe to
2922       change the order of the computation
2923    2. no uses for a2 in the loop (a2 is used out of the loop)
2924    3. no uses of a1 in the loop besides the reduction operation
2925    4. no uses of a1 outside the loop.
2926
2927    Conditions 1,4 are tested here.
2928    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2929
2930    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2931    nested cycles.
2932
2933    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2934    reductions:
2935
2936      a1 = phi < a0, a2 >
2937      inner loop (def of a3)
2938      a2 = phi < a3 >
2939
2940    (4) Detect condition expressions, ie:
2941      for (int i = 0; i < N; i++)
2942        if (a[i] < val)
2943         ret_val = a[i];
2944
2945 */
2946
2947 static stmt_vec_info
2948 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2949                           bool *double_reduc,
2950                           bool need_wrapping_integral_overflow,
2951                           enum vect_reduction_type *v_reduc_type)
2952 {
2953   gphi *phi = as_a <gphi *> (phi_info->stmt);
2954   struct loop *loop = (gimple_bb (phi))->loop_father;
2955   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2956   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2957   gimple *phi_use_stmt = NULL;
2958   enum tree_code orig_code, code;
2959   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2960   tree type;
2961   tree name;
2962   imm_use_iterator imm_iter;
2963   use_operand_p use_p;
2964   bool phi_def;
2965
2966   *double_reduc = false;
2967   *v_reduc_type = TREE_CODE_REDUCTION;
2968
2969   tree phi_name = PHI_RESULT (phi);
2970   /* ???  If there are no uses of the PHI result the inner loop reduction
2971      won't be detected as possibly double-reduction by vectorizable_reduction
2972      because that tries to walk the PHI arg from the preheader edge which
2973      can be constant.  See PR60382.  */
2974   if (has_zero_uses (phi_name))
2975     return NULL;
2976   unsigned nphi_def_loop_uses = 0;
2977   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2978     {
2979       gimple *use_stmt = USE_STMT (use_p);
2980       if (is_gimple_debug (use_stmt))
2981         continue;
2982
2983       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2984         {
2985           if (dump_enabled_p ())
2986             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2987                              "intermediate value used outside loop.\n");
2988
2989           return NULL;
2990         }
2991
2992       nphi_def_loop_uses++;
2993       phi_use_stmt = use_stmt;
2994     }
2995
2996   edge latch_e = loop_latch_edge (loop);
2997   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2998   if (TREE_CODE (loop_arg) != SSA_NAME)
2999     {
3000       if (dump_enabled_p ())
3001         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002                          "reduction: not ssa_name: %T\n", loop_arg);
3003       return NULL;
3004     }
3005
3006   stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
3007   if (!def_stmt_info
3008       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3009     return NULL;
3010
3011   if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
3012     {
3013       name = gimple_assign_lhs (def_stmt);
3014       phi_def = false;
3015     }
3016   else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3017     {
3018       name = PHI_RESULT (def_stmt);
3019       phi_def = true;
3020     }
3021   else
3022     {
3023       if (dump_enabled_p ())
3024         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3025                          "reduction: unhandled reduction operation: %G",
3026                          def_stmt_info->stmt);
3027       return NULL;
3028     }
3029
3030   unsigned nlatch_def_loop_uses = 0;
3031   auto_vec<gphi *, 3> lcphis;
3032   bool inner_loop_of_double_reduc = false;
3033   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3034     {
3035       gimple *use_stmt = USE_STMT (use_p);
3036       if (is_gimple_debug (use_stmt))
3037         continue;
3038       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3039         nlatch_def_loop_uses++;
3040       else
3041         {
3042           /* We can have more than one loop-closed PHI.  */
3043           lcphis.safe_push (as_a <gphi *> (use_stmt));
3044           if (nested_in_vect_loop
3045               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3046                   == vect_double_reduction_def))
3047             inner_loop_of_double_reduc = true;
3048         }
3049     }
3050
3051   /* If this isn't a nested cycle or if the nested cycle reduction value
3052      is used outside of the inner loop we cannot handle uses of the reduction
3053      value.  */
3054   if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3055       && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
3056     {
3057       if (dump_enabled_p ())
3058         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3059                          "reduction used in loop.\n");
3060       return NULL;
3061     }
3062
3063   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3064      defined in the inner loop.  */
3065   if (phi_def)
3066     {
3067       gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
3068       op1 = PHI_ARG_DEF (def_stmt, 0);
3069
3070       if (gimple_phi_num_args (def_stmt) != 1
3071           || TREE_CODE (op1) != SSA_NAME)
3072         {
3073           if (dump_enabled_p ())
3074             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3075                              "unsupported phi node definition.\n");
3076
3077           return NULL;
3078         }
3079
3080       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3081       if (gimple_bb (def1)
3082           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3083           && loop->inner
3084           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3085           && is_gimple_assign (def1)
3086           && is_a <gphi *> (phi_use_stmt)
3087           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3088         {
3089           if (dump_enabled_p ())
3090             report_vect_op (MSG_NOTE, def_stmt,
3091                             "detected double reduction: ");
3092
3093           *double_reduc = true;
3094           return def_stmt_info;
3095         }
3096
3097       return NULL;
3098     }
3099
3100   /* If we are vectorizing an inner reduction we are executing that
3101      in the original order only in case we are not dealing with a
3102      double reduction.  */
3103   bool check_reduction = true;
3104   if (flow_loop_nested_p (vect_loop, loop))
3105     {
3106       gphi *lcphi;
3107       unsigned i;
3108       check_reduction = false;
3109       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3110         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3111           {
3112             gimple *use_stmt = USE_STMT (use_p);
3113             if (is_gimple_debug (use_stmt))
3114               continue;
3115             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3116               check_reduction = true;
3117           }
3118     }
3119
3120   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3121   code = orig_code = gimple_assign_rhs_code (def_stmt);
3122
3123   if (nested_in_vect_loop && !check_reduction)
3124     {
3125       /* FIXME: Even for non-reductions code generation is funneled
3126          through vectorizable_reduction for the stmt defining the
3127          PHI latch value.  So we have to artificially restrict ourselves
3128          for the supported operations.  */
3129       switch (get_gimple_rhs_class (code))
3130         {
3131         case GIMPLE_BINARY_RHS:
3132         case GIMPLE_TERNARY_RHS:
3133           break;
3134         default:
3135           /* Not supported by vectorizable_reduction.  */
3136           if (dump_enabled_p ())
3137             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3138                             "nested cycle: not handled operation: ");
3139           return NULL;
3140         }
3141       if (dump_enabled_p ())
3142         report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3143       return def_stmt_info;
3144     }
3145
3146   /* We can handle "res -= x[i]", which is non-associative by
3147      simply rewriting this into "res += -x[i]".  Avoid changing
3148      gimple instruction for the first simple tests and only do this
3149      if we're allowed to change code at all.  */
3150   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3151     code = PLUS_EXPR;
3152
3153   if (code == COND_EXPR)
3154     {
3155       if (! nested_in_vect_loop)
3156         *v_reduc_type = COND_REDUCTION;
3157
3158       op3 = gimple_assign_rhs1 (def_stmt);
3159       if (COMPARISON_CLASS_P (op3))
3160         {
3161           op4 = TREE_OPERAND (op3, 1);
3162           op3 = TREE_OPERAND (op3, 0);
3163         }
3164       if (op3 == phi_name || op4 == phi_name)
3165         {
3166           if (dump_enabled_p ())
3167             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3168                             "reduction: condition depends on previous"
3169                             " iteration: ");
3170           return NULL;
3171         }
3172
3173       op1 = gimple_assign_rhs2 (def_stmt);
3174       op2 = gimple_assign_rhs3 (def_stmt);
3175     }
3176   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3177     {
3178       if (dump_enabled_p ())
3179         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3180                         "reduction: not commutative/associative: ");
3181       return NULL;
3182     }
3183   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3184     {
3185       op1 = gimple_assign_rhs1 (def_stmt);
3186       op2 = gimple_assign_rhs2 (def_stmt);
3187     }
3188   else
3189     {
3190       if (dump_enabled_p ())
3191         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3192                         "reduction: not handled operation: ");
3193       return NULL;
3194     }
3195
3196   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3197     {
3198       if (dump_enabled_p ())
3199         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3200                         "reduction: both uses not ssa_names: ");
3201
3202       return NULL;
3203     }
3204
3205   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3206   if ((TREE_CODE (op1) == SSA_NAME
3207        && !types_compatible_p (type,TREE_TYPE (op1)))
3208       || (TREE_CODE (op2) == SSA_NAME
3209           && !types_compatible_p (type, TREE_TYPE (op2)))
3210       || (op3 && TREE_CODE (op3) == SSA_NAME
3211           && !types_compatible_p (type, TREE_TYPE (op3)))
3212       || (op4 && TREE_CODE (op4) == SSA_NAME
3213           && !types_compatible_p (type, TREE_TYPE (op4))))
3214     {
3215       if (dump_enabled_p ())
3216         {
3217           dump_printf_loc (MSG_NOTE, vect_location,
3218                            "reduction: multiple types: operation type: "
3219                            "%T, operands types: %T,%T",
3220                            type,  TREE_TYPE (op1), TREE_TYPE (op2));
3221           if (op3)
3222             dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3223
3224           if (op4)
3225             dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3226           dump_printf (MSG_NOTE, "\n");
3227         }
3228
3229       return NULL;
3230     }
3231
3232   /* Check whether it's ok to change the order of the computation.
3233      Generally, when vectorizing a reduction we change the order of the
3234      computation.  This may change the behavior of the program in some
3235      cases, so we need to check that this is ok.  One exception is when
3236      vectorizing an outer-loop: the inner-loop is executed sequentially,
3237      and therefore vectorizing reductions in the inner-loop during
3238      outer-loop vectorization is safe.  */
3239   if (check_reduction
3240       && *v_reduc_type == TREE_CODE_REDUCTION
3241       && needs_fold_left_reduction_p (type, code,
3242                                       need_wrapping_integral_overflow))
3243     *v_reduc_type = FOLD_LEFT_REDUCTION;
3244
3245   /* Reduction is safe. We're dealing with one of the following:
3246      1) integer arithmetic and no trapv
3247      2) floating point arithmetic, and special flags permit this optimization
3248      3) nested cycle (i.e., outer loop vectorization).  */
3249   stmt_vec_info def1_info = loop_info->lookup_def (op1);
3250   stmt_vec_info def2_info = loop_info->lookup_def (op2);
3251   if (code != COND_EXPR && !def1_info && !def2_info)
3252     {
3253       if (dump_enabled_p ())
3254         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3255       return NULL;
3256     }
3257
3258   /* Check that one def is the reduction def, defined by PHI,
3259      the other def is either defined in the loop ("vect_internal_def"),
3260      or it's an induction (defined by a loop-header phi-node).  */
3261
3262   if (def2_info
3263       && def2_info->stmt == phi
3264       && (code == COND_EXPR
3265           || !def1_info
3266           || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3267           || vect_valid_reduction_input_p (def1_info)))
3268     {
3269       if (dump_enabled_p ())
3270         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3271       return def_stmt_info;
3272     }
3273
3274   if (def1_info
3275       && def1_info->stmt == phi
3276       && (code == COND_EXPR
3277           || !def2_info
3278           || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3279           || vect_valid_reduction_input_p (def2_info)))
3280     {
3281       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3282         {
3283           /* Check if we can swap operands (just for simplicity - so that
3284              the rest of the code can assume that the reduction variable
3285              is always the last (second) argument).  */
3286           if (code == COND_EXPR)
3287             {
3288               /* Swap cond_expr by inverting the condition.  */
3289               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3290               enum tree_code invert_code = ERROR_MARK;
3291               enum tree_code cond_code = TREE_CODE (cond_expr);
3292
3293               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3294                 {
3295                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3296                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3297                 }
3298               if (invert_code != ERROR_MARK)
3299                 {
3300                   TREE_SET_CODE (cond_expr, invert_code);
3301                   swap_ssa_operands (def_stmt,
3302                                      gimple_assign_rhs2_ptr (def_stmt),
3303                                      gimple_assign_rhs3_ptr (def_stmt));
3304                 }
3305               else
3306                 {
3307                   if (dump_enabled_p ())
3308                     report_vect_op (MSG_NOTE, def_stmt,
3309                                     "detected reduction: cannot swap operands "
3310                                     "for cond_expr");
3311                   return NULL;
3312                 }
3313             }
3314           else
3315             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3316                                gimple_assign_rhs2_ptr (def_stmt));
3317
3318           if (dump_enabled_p ())
3319             report_vect_op (MSG_NOTE, def_stmt,
3320                             "detected reduction: need to swap operands: ");
3321
3322           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3323             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3324         }
3325       else
3326         {
3327           if (dump_enabled_p ())
3328             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3329         }
3330
3331       return def_stmt_info;
3332     }
3333
3334   /* Try to find SLP reduction chain.  */
3335   if (! nested_in_vect_loop
3336       && code != COND_EXPR
3337       && orig_code != MINUS_EXPR
3338       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3339     {
3340       if (dump_enabled_p ())
3341         report_vect_op (MSG_NOTE, def_stmt,
3342                         "reduction: detected reduction chain: ");
3343
3344       return def_stmt_info;
3345     }
3346
3347   /* Look for the expression computing loop_arg from loop PHI result.  */
3348   if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3349     return def_stmt_info;
3350
3351   if (dump_enabled_p ())
3352     {
3353       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3354                       "reduction: unknown pattern: ");
3355     }
3356
3357   return NULL;
3358 }
3359
3360 /* Wrapper around vect_is_simple_reduction, which will modify code
3361    in-place if it enables detection of more reductions.  Arguments
3362    as there.  */
3363
3364 stmt_vec_info
3365 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3366                              bool *double_reduc,
3367                              bool need_wrapping_integral_overflow)
3368 {
3369   enum vect_reduction_type v_reduc_type;
3370   stmt_vec_info def_info
3371     = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3372                                 need_wrapping_integral_overflow,
3373                                 &v_reduc_type);
3374   if (def_info)
3375     {
3376       STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3377       STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3378       STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3379       STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3380     }
3381   return def_info;
3382 }
3383
3384 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3385 int
3386 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3387                              int *peel_iters_epilogue,
3388                              stmt_vector_for_cost *scalar_cost_vec,
3389                              stmt_vector_for_cost *prologue_cost_vec,
3390                              stmt_vector_for_cost *epilogue_cost_vec)
3391 {
3392   int retval = 0;
3393   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3394
3395   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3396     {
3397       *peel_iters_epilogue = assumed_vf / 2;
3398       if (dump_enabled_p ())
3399         dump_printf_loc (MSG_NOTE, vect_location,
3400                          "cost model: epilogue peel iters set to vf/2 "
3401                          "because loop iterations are unknown .\n");
3402
3403       /* If peeled iterations are known but number of scalar loop
3404          iterations are unknown, count a taken branch per peeled loop.  */
3405       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3406                                  NULL, 0, vect_prologue);
3407       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3408                                   NULL, 0, vect_epilogue);
3409     }
3410   else
3411     {
3412       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3413       peel_iters_prologue = niters < peel_iters_prologue ?
3414                             niters : peel_iters_prologue;
3415       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3416       /* If we need to peel for gaps, but no peeling is required, we have to
3417          peel VF iterations.  */
3418       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3419         *peel_iters_epilogue = assumed_vf;
3420     }
3421
3422   stmt_info_for_cost *si;
3423   int j;
3424   if (peel_iters_prologue)
3425     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3426       retval += record_stmt_cost (prologue_cost_vec,
3427                                   si->count * peel_iters_prologue,
3428                                   si->kind, si->stmt_info, si->misalign,
3429                                   vect_prologue);
3430   if (*peel_iters_epilogue)
3431     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3432       retval += record_stmt_cost (epilogue_cost_vec,
3433                                   si->count * *peel_iters_epilogue,
3434                                   si->kind, si->stmt_info, si->misalign,
3435                                   vect_epilogue);
3436
3437   return retval;
3438 }
3439
3440 /* Function vect_estimate_min_profitable_iters
3441
3442    Return the number of iterations required for the vector version of the
3443    loop to be profitable relative to the cost of the scalar version of the
3444    loop.
3445
3446    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3447    of iterations for vectorization.  -1 value means loop vectorization
3448    is not profitable.  This returned value may be used for dynamic
3449    profitability check.
3450
3451    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3452    for static check against estimated number of iterations.

        Both outputs are set to 0 when the cost model is disabled for the
        loop, and both are set to -1 when one vector iteration costs at
        least as much as the scalar iterations it replaces (using the
        vectorization factor assumed for costing).  Otherwise the static
        estimate stored in *RET_MIN_PROFITABLE_ESTIMATE is never smaller
        than the runtime threshold stored in *RET_MIN_PROFITABLE_NITERS.

        As a side effect, versioning, peeling and guard costs are added to
        the target cost data of LOOP_VINFO before finish_cost is called
        on it.  */
3453
3454 static void
3455 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3456                                     int *ret_min_profitable_niters,
3457                                     int *ret_min_profitable_estimate)
3458 {
3459   int min_profitable_iters;
3460   int min_profitable_estimate;
3461   int peel_iters_prologue;
3462   int peel_iters_epilogue;
3463   unsigned vec_inside_cost = 0;
3464   int vec_outside_cost = 0;
3465   unsigned vec_prologue_cost = 0;
3466   unsigned vec_epilogue_cost = 0;
3467   int scalar_single_iter_cost = 0;
3468   int scalar_outside_cost = 0;
3469   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3470   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3471   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3472
3473   /* Cost model disabled.  */
3474   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3475     {
3476       if (dump_enabled_p ())
3477         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3478       *ret_min_profitable_niters = 0;
3479       *ret_min_profitable_estimate = 0;
3480       return;
3481     }
3482
3483   /* Requires loop versioning tests to handle misalignment.  */
3484   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3485     {
3486       /*  FIXME: Make cost depend on complexity of individual check.  */
3487       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3488       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3489                             vect_prologue);
3490       if (dump_enabled_p ())
3491         dump_printf (MSG_NOTE,
3492                      "cost model: Adding cost of checks for loop "
3493                      "versioning to treat misalignment.\n");
3494     }
3495
3496   /* Requires loop versioning with alias checks.  */
3497   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3498     {
3499       /*  FIXME: Make cost depend on complexity of individual check.  */
3500       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3501       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3502                             vect_prologue);
3503       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3504       if (len)
3505         /* Count LEN - 1 ANDs and LEN comparisons.  */
3506         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3507                               NULL, 0, vect_prologue);
3508       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3509       if (len)
3510         {
3511           /* Count LEN - 1 ANDs and LEN comparisons.  */
3512           unsigned int nstmts = len * 2 - 1;
3513           /* +1 for each bias that needs adding.  */
3514           for (unsigned int i = 0; i < len; ++i)
3515             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3516               nstmts += 1;
3517           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3518                                 NULL, 0, vect_prologue);
3519         }
3520       if (dump_enabled_p ())
3521         dump_printf (MSG_NOTE,
3522                      "cost model: Adding cost of checks for loop "
3523                      "versioning aliasing.\n");
3524     }
3525
3526   /* Requires loop versioning with niter checks.  */
3527   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3528     {
3529       /*  FIXME: Make cost depend on complexity of individual check.  */
3530       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3531                             vect_prologue);
3532       if (dump_enabled_p ())
3533         dump_printf (MSG_NOTE,
3534                      "cost model: Adding cost of checks for loop "
3535                      "versioning niters.\n");
3536     }
3537
       /* Any kind of versioning also pays for one taken branch in the
          prologue.  */
3538   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3539     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3540                           vect_prologue);
3541
3542   /* Count statements in scalar loop.  Using this as scalar cost for a single
3543      iteration for now.
3544
3545      TODO: Add outer loop support.
3546
3547      TODO: Consider assigning different costs to different scalar
3548      statements.  */
3549
3550   scalar_single_iter_cost
3551     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3552
3553   /* Add additional cost for the peeled instructions in prologue and epilogue
3554      loop.  (For fully-masked loops there will be no peeling.)
3555
3556      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3557      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3558
3559      TODO: Build an expression that represents peel_iters for prologue and
3560      epilogue to be used in a run-time test.  */
3561
3562   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3563     {
3564       peel_iters_prologue = 0;
3565       peel_iters_epilogue = 0;
3566
3567       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3568         {
3569           /* We need to peel exactly one iteration.  */
3570           peel_iters_epilogue += 1;
3571           stmt_info_for_cost *si;
3572           int j;
3573           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3574                             j, si)
3575             (void) add_stmt_cost (target_cost_data, si->count,
3576                                   si->kind, si->stmt_info, si->misalign,
3577                                   vect_epilogue);
3578         }
3579     }
3580   else if (npeel < 0)
3581     {
           /* NPEEL is negative here, which this function treats as an
              unknown amount of peeling for alignment.  */
3582       peel_iters_prologue = assumed_vf / 2;
3583       if (dump_enabled_p ())
3584         dump_printf (MSG_NOTE, "cost model: "
3585                      "prologue peel iters set to vf/2.\n");
3586
3587       /* If peeling for alignment is unknown, loop bound of main loop becomes
3588          unknown.  */
3589       peel_iters_epilogue = assumed_vf / 2;
3590       if (dump_enabled_p ())
3591         dump_printf (MSG_NOTE, "cost model: "
3592                      "epilogue peel iters set to vf/2 because "
3593                      "peeling for alignment is unknown.\n");
3594
3595       /* If peeled iterations are unknown, count a taken branch and a not taken
3596          branch per peeled loop. Even if scalar loop iterations are known,
3597          vector iterations are not known since peeled prologue iterations are
3598          not known. Hence guards remain the same.  */
3599       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3600                             NULL, 0, vect_prologue);
3601       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3602                             NULL, 0, vect_prologue);
3603       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3604                             NULL, 0, vect_epilogue);
3605       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3606                             NULL, 0, vect_epilogue);
           /* Charge every scalar statement once per assumed peeled
              iteration, separately for the prologue and the epilogue.  */
3607       stmt_info_for_cost *si;
3608       int j;
3609       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3610         {
3611           (void) add_stmt_cost (target_cost_data,
3612                                 si->count * peel_iters_prologue,
3613                                 si->kind, si->stmt_info, si->misalign,
3614                                 vect_prologue);
3615           (void) add_stmt_cost (target_cost_data,
3616                                 si->count * peel_iters_epilogue,
3617                                 si->kind, si->stmt_info, si->misalign,
3618                                 vect_epilogue);
3619         }
3620     }
3621   else
3622     {
           /* NPEEL is a known, non-negative prologue peel count.  Let
              vect_get_known_peeling_cost derive the epilogue count and
              collect the peeling costs in two temporary vectors, then feed
              those entries into the target cost data.  */
3623       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3624       stmt_info_for_cost *si;
3625       int j;
3626       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3627
3628       prologue_cost_vec.create (2);
3629       epilogue_cost_vec.create (2);
3630       peel_iters_prologue = npeel;
3631
3632       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3633                                           &peel_iters_epilogue,
3634                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3635                                             (loop_vinfo),
3636                                           &prologue_cost_vec,
3637                                           &epilogue_cost_vec);
3638
3639       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3640         (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3641                               si->misalign, vect_prologue);
3642
3643       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3644         (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3645                               si->misalign, vect_epilogue);
3646
3647       prologue_cost_vec.release ();
3648       epilogue_cost_vec.release ();
3649     }
3650
3651   /* FORNOW: The scalar outside cost is incremented in one of the
3652      following ways:
3653
3654      1. The vectorizer checks for alignment and aliasing and generates
3655      a condition that allows dynamic vectorization.  A cost model
3656      check is ANDED with the versioning condition.  Hence scalar code
3657      path now has the added cost of the versioning check.
3658
3659        if (cost > th & versioning_check)
3660          jmp to vector code
3661
3662      Hence run-time scalar is incremented by not-taken branch cost.
3663
3664      2. The vectorizer then checks if a prologue is required.  If the
3665      cost model check was not done before during versioning, it has to
3666      be done before the prologue check.
3667
3668        if (cost <= th)
3669          prologue = scalar_iters
3670        if (prologue == 0)
3671          jmp to vector code
3672        else
3673          execute prologue
3674        if (prologue == num_iters)
3675          go to exit
3676
3677      Hence the run-time scalar cost is incremented by a taken branch,
3678      plus a not-taken branch, plus a taken branch cost.
3679
3680      3. The vectorizer then checks if an epilogue is required.  If the
3681      cost model check was not done before during prologue check, it
3682      has to be done with the epilogue check.
3683
3684        if (prologue == 0)
3685          jmp to vector code
3686        else
3687          execute prologue
3688        if (prologue == num_iters)
3689          go to exit
3690        vector code:
3691          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3692            jmp to epilogue
3693
3694      Hence the run-time scalar cost should be incremented by 2 taken
3695      branches.
3696
3697      TODO: The back end may reorder the BBS's differently and reverse
3698      conditions/branch directions.  Change the estimates below to
3699      something more reasonable.  */
3700
3701   /* If the number of iterations is known and we do not do versioning, we can
3702      decide whether to vectorize at compile time.  Hence the scalar version
3703      do not carry cost model guard costs.  */
3704   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3705       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3706     {
3707       /* Cost model check occurs at versioning.  */
3708       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3709         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3710       else
3711         {
3712           /* Cost model check occurs at prologue generation.  */
3713           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3714             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3715               + vect_get_stmt_cost (cond_branch_not_taken);
3716           /* Cost model check occurs at epilogue generation.  */
3717           else
3718             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3719         }
3720     }
3721
3722   /* Complete the target-specific cost calculations.  */
3723   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3724                &vec_inside_cost, &vec_epilogue_cost);
3725
       /* The vector outside cost (VOC below) is the prologue cost plus the
          epilogue cost.  */
3726   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3727
3728   if (dump_enabled_p ())
3729     {
3730       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3731       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3732                    vec_inside_cost);
3733       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3734                    vec_prologue_cost);
3735       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3736                    vec_epilogue_cost);
3737       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3738                    scalar_single_iter_cost);
3739       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3740                    scalar_outside_cost);
3741       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3742                    vec_outside_cost);
3743       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3744                    peel_iters_prologue);
3745       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3746                    peel_iters_epilogue);
3747     }
3748
3749   /* Calculate number of iterations required to make the vector version
3750      profitable, relative to the loop bodies only.  The following condition
3751      must hold true:
3752      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3753      where
3754      SIC = scalar iteration cost, VIC = vector iteration cost,
3755      VOC = vector outside cost, VF = vectorization factor,
3756      NPEEL = prologue iterations + epilogue iterations,
3757      SOC = scalar outside cost for run time cost model check.  */
3758
       /* SIC * VF - VIC: the cost saved each time one vector iteration
          replaces VF scalar iterations.  If this is not positive, both
          outputs are set to -1 below.  */
3759   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3760                           - vec_inside_cost);
3761   if (saving_per_viter <= 0)
3762     {
3763       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3764         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3765                     "vectorization did not happen for a simd loop");
3766
3767       if (dump_enabled_p ())
3768         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769                          "cost model: the vector iteration cost = %d "
3770                          "divided by the scalar iteration cost = %d "
3771                          "is greater or equal to the vectorization factor = %d"
3772                          ".\n",
3773                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3774       *ret_min_profitable_niters = -1;
3775       *ret_min_profitable_estimate = -1;
3776       return;
3777     }
3778
3779   /* ??? The "if" arm is written to handle all cases; see below for what
3780      we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
3781   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3782     {
3783       /* Rewriting the condition above in terms of the number of
3784          vector iterations (vniters) rather than the number of
3785          scalar iterations (niters) gives:
3786
3787          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3788
3789          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3790
3791          For integer N, X and Y when X > 0:
3792
3793          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
3794       int outside_overhead = (vec_outside_cost
3795                               - scalar_single_iter_cost * peel_iters_prologue
3796                               - scalar_single_iter_cost * peel_iters_epilogue
3797                               - scalar_outside_cost);
3798       /* We're only interested in cases that require at least one
3799          vector iteration.  */
3800       int min_vec_niters = 1;
3801       if (outside_overhead > 0)
3802         min_vec_niters = outside_overhead / saving_per_viter + 1;
3803
3804       if (dump_enabled_p ())
3805         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
3806                      min_vec_niters);
3807
3808       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3809         {
3810           /* Now that we know the minimum number of vector iterations,
3811              find the minimum niters for which the scalar cost is larger:
3812
3813              SIC * niters > VIC * vniters + VOC - SOC
3814
3815              We know that the minimum niters is no more than
3816              vniters * VF + NPEEL, but it might be (and often is) less
3817              than that if a partial vector iteration is cheaper than the
3818              equivalent scalar code.  */
3819           int threshold = (vec_inside_cost * min_vec_niters
3820                            + vec_outside_cost
3821                            - scalar_outside_cost);
3822           if (threshold <= 0)
3823             min_profitable_iters = 1;
3824           else
3825             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3826         }
3827       else
3828         /* Convert the number of vector iterations into a number of
3829            scalar iterations.  */
3830         min_profitable_iters = (min_vec_niters * assumed_vf
3831                                 + peel_iters_prologue
3832                                 + peel_iters_epilogue);
3833     }
3834   else
3835     {
           /* Multiplying the profitability condition by VF and collecting
              the niters terms gives

                niters * (SIC * VF - VIC) > (VOC - SOC) * VF - VIC * NPEEL

              The right-hand side is computed first; a non-positive value
              means no minimum is needed from this formula.  */
3836       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3837                               * assumed_vf
3838                               - vec_inside_cost * peel_iters_prologue
3839                               - vec_inside_cost * peel_iters_epilogue);
3840       if (min_profitable_iters <= 0)
3841         min_profitable_iters = 0;
3842       else
3843         {
3844           min_profitable_iters /= saving_per_viter;
3845
               /* The division above truncates.  Add one iteration if, at the
                  truncated count, the scalar side still does not exceed the
                  vector side.  Note that this check as written leaves out
                  the VIC * NPEEL term used in the numerator above.  */
3846           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3847               <= (((int) vec_inside_cost * min_profitable_iters)
3848                   + (((int) vec_outside_cost - scalar_outside_cost)
3849                      * assumed_vf)))
3850             min_profitable_iters++;
3851         }
3852     }
3853
3854   if (dump_enabled_p ())
3855     dump_printf (MSG_NOTE,
3856                  "  Calculated minimum iters for profitability: %d\n",
3857                  min_profitable_iters);
3858
3859   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3860       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3861     /* We want the vectorized loop to execute at least once.  */
3862     min_profitable_iters = assumed_vf + peel_iters_prologue;
3863
3864   if (dump_enabled_p ())
3865     dump_printf_loc (MSG_NOTE, vect_location,
3866                      "  Runtime profitability threshold = %d\n",
3867                      min_profitable_iters);
3868
3869   *ret_min_profitable_niters = min_profitable_iters;
3870
3871   /* Calculate number of iterations required to make the vector version
3872      profitable, relative to the loop bodies only.
3873
3874      Non-vectorized variant is SIC * niters and it must win over vector
3875      variant on the expected loop trip count.  The following condition must hold true:
3876      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3877
3878   if (vec_outside_cost <= 0)
3879     min_profitable_estimate = 0;
3880   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3881     {
3882       /* This is a repeat of the code above, but with + SOC rather
3883          than - SOC.  */
3884       int outside_overhead = (vec_outside_cost
3885                               - scalar_single_iter_cost * peel_iters_prologue
3886                               - scalar_single_iter_cost * peel_iters_epilogue
3887                               + scalar_outside_cost);
3888       int min_vec_niters = 1;
3889       if (outside_overhead > 0)
3890         min_vec_niters = outside_overhead / saving_per_viter + 1;
3891
3892       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3893         {
3894           int threshold = (vec_inside_cost * min_vec_niters
3895                            + vec_outside_cost
3896                            + scalar_outside_cost);
3897           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3898         }
3899       else
3900         min_profitable_estimate = (min_vec_niters * assumed_vf
3901                                    + peel_iters_prologue
3902                                    + peel_iters_epilogue);
3903     }
3904   else
3905     {
           /* Same closed form as the runtime threshold for loops that are
              not fully masked, but with + SOC rather than - SOC:

                ((VOC + SOC) * VF - VIC * NPEEL) / (SIC * VF - VIC)  */
3906       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3907                                  * assumed_vf
3908                                  - vec_inside_cost * peel_iters_prologue
3909                                  - vec_inside_cost * peel_iters_epilogue)
3910                                  / ((scalar_single_iter_cost * assumed_vf)
3911                                    - vec_inside_cost);
3912     }
       /* The static estimate is never allowed to be lower than the runtime
          threshold computed above.  */
3913   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3914   if (dump_enabled_p ())
3915     dump_printf_loc (MSG_NOTE, vect_location,
3916                      "  Static estimate profitability threshold = %d\n",
3917                      min_profitable_estimate);
3918
3919   *ret_min_profitable_estimate = min_profitable_estimate;
3920 }
3921
3922 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3923    vector elements (not bits) for a vector with NELT elements.  */
3924 static void
3925 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3926                               vec_perm_builder *sel)
3927 {
3928   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3929      by vec_perm_indices.  */
3930   sel->new_vector (nelt, 1, 3);
3931   for (unsigned int i = 0; i < 3; i++)
3932     sel->quick_push (i + offset);
3933 }
3934
3935 /* Checks whether the target supports whole-vector shifts for vectors of mode
3936    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3937    it supports vec_perm_const with masks for all necessary shift amounts.  */
3938 static bool
3939 have_whole_vector_shift (machine_mode mode)
3940 {
3941   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3942     return true;
3943
3944   /* Variable-length vectors should be handled via the optab.  */
3945   unsigned int nelt;
3946   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3947     return false;
3948
3949   vec_perm_builder sel;
3950   vec_perm_indices indices;
3951   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3952     {
3953       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3954       indices.new_vector (sel, 2, nelt);
3955       if (!can_vec_perm_const_p (mode, indices, false))
3956         return false;
3957     }
3958   return true;
3959 }
3960
3961 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3962    functions. Design better to avoid maintenance issues.  */
3963
3964 /* Function vect_model_reduction_cost.
3965
3966    Models cost for a reduction operation, including the vector ops
3967    generated within the strip-mine loop, the initial definition before
3968    the loop, and the epilogue code that must be generated.  */
3969
3970 static void
3971 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3972                            int ncopies, stmt_vector_for_cost *cost_vec)
3973 {
3974   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3975   enum tree_code code;
3976   optab optab;
3977   tree vectype;
3978   machine_mode mode;
3979   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3980   struct loop *loop = NULL;
3981
3982   if (loop_vinfo)
3983     loop = LOOP_VINFO_LOOP (loop_vinfo);
3984
3985   /* Condition reductions generate two reductions in the loop.  */
3986   vect_reduction_type reduction_type
3987     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3988   if (reduction_type == COND_REDUCTION)
3989     ncopies *= 2;
3990
3991   vectype = STMT_VINFO_VECTYPE (stmt_info);
3992   mode = TYPE_MODE (vectype);
3993   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3994
3995   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3996
3997   if (reduction_type == EXTRACT_LAST_REDUCTION
3998       || reduction_type == FOLD_LEFT_REDUCTION)
3999     {
4000       /* No extra instructions needed in the prologue.  */
4001       prologue_cost = 0;
4002
4003       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4004         /* Count one reduction-like operation per vector.  */
4005         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4006                                         stmt_info, 0, vect_body);
4007       else
4008         {
4009           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4010           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4011           inside_cost = record_stmt_cost (cost_vec, nelements,
4012                                           vec_to_scalar, stmt_info, 0,
4013                                           vect_body);
4014           inside_cost += record_stmt_cost (cost_vec, nelements,
4015                                            scalar_stmt, stmt_info, 0,
4016                                            vect_body);
4017         }
4018     }
4019   else
4020     {
4021       /* Add in cost for initial definition.
4022          For cond reduction we have four vectors: initial index, step,
4023          initial result of the data reduction, initial value of the index
4024          reduction.  */
4025       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4026       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4027                                          scalar_to_vec, stmt_info, 0,
4028                                          vect_prologue);
4029
4030       /* Cost of reduction op inside loop.  */
4031       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4032                                       stmt_info, 0, vect_body);
4033     }
4034
4035   /* Determine cost of epilogue code.
4036
4037      We have a reduction operator that will reduce the vector in one statement.
4038      Also requires scalar extract.  */
4039
4040   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4041     {
4042       if (reduc_fn != IFN_LAST)
4043         {
4044           if (reduction_type == COND_REDUCTION)
4045             {
4046               /* An EQ stmt and an COND_EXPR stmt.  */
4047               epilogue_cost += record_stmt_cost (cost_vec, 2,
4048                                                  vector_stmt, stmt_info, 0,
4049                                                  vect_epilogue);
4050               /* Reduction of the max index and a reduction of the found
4051                  values.  */
4052               epilogue_cost += record_stmt_cost (cost_vec, 2,
4053                                                  vec_to_scalar, stmt_info, 0,
4054                                                  vect_epilogue);
4055               /* A broadcast of the max value.  */
4056               epilogue_cost += record_stmt_cost (cost_vec, 1,
4057                                                  scalar_to_vec, stmt_info, 0,
4058                                                  vect_epilogue);
4059             }
4060           else
4061             {
4062               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4063                                                  stmt_info, 0, vect_epilogue);
4064               epilogue_cost += record_stmt_cost (cost_vec, 1,
4065                                                  vec_to_scalar, stmt_info, 0,
4066                                                  vect_epilogue);
4067             }
4068         }
4069       else if (reduction_type == COND_REDUCTION)
4070         {
4071           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4072           /* Extraction of scalar elements.  */
4073           epilogue_cost += record_stmt_cost (cost_vec,
4074                                              2 * estimated_nunits,
4075                                              vec_to_scalar, stmt_info, 0,
4076                                              vect_epilogue);
4077           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4078           epilogue_cost += record_stmt_cost (cost_vec,
4079                                              2 * estimated_nunits - 3,
4080                                              scalar_stmt, stmt_info, 0,
4081                                              vect_epilogue);
4082         }
4083       else if (reduction_type == EXTRACT_LAST_REDUCTION
4084                || reduction_type == FOLD_LEFT_REDUCTION)
4085         /* No extra instructions need in the epilogue.  */
4086         ;
4087       else
4088         {
4089           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4090           tree bitsize =
4091             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4092           int element_bitsize = tree_to_uhwi (bitsize);
4093           int nelements = vec_size_in_bits / element_bitsize;
4094
4095           if (code == COND_EXPR)
4096             code = MAX_EXPR;
4097
4098           optab = optab_for_tree_code (code, vectype, optab_default);
4099
4100           /* We have a whole vector shift available.  */
4101           if (optab != unknown_optab
4102               && VECTOR_MODE_P (mode)
4103               && optab_handler (optab, mode) != CODE_FOR_nothing
4104               && have_whole_vector_shift (mode))
4105             {
4106               /* Final reduction via vector shifts and the reduction operator.
4107                  Also requires scalar extract.  */
4108               epilogue_cost += record_stmt_cost (cost_vec,
4109                                                  exact_log2 (nelements) * 2,
4110                                                  vector_stmt, stmt_info, 0,
4111                                                  vect_epilogue);
4112               epilogue_cost += record_stmt_cost (cost_vec, 1,
4113                                                  vec_to_scalar, stmt_info, 0,
4114                                                  vect_epilogue);
4115             }
4116           else
4117             /* Use extracts and reduction op for final reduction.  For N
4118                elements, we have N extracts and N-1 reduction ops.  */
4119             epilogue_cost += record_stmt_cost (cost_vec,
4120                                                nelements + nelements - 1,
4121                                                vector_stmt, stmt_info, 0,
4122                                                vect_epilogue);
4123         }
4124     }
4125
4126   if (dump_enabled_p ())
4127     dump_printf (MSG_NOTE,
4128                  "vect_model_reduction_cost: inside_cost = %d, "
4129                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4130                  prologue_cost, epilogue_cost);
4131 }
4132
4133
4134 /* Function vect_model_induction_cost.
4135
4136    Models cost for induction operations.  */
4137
4138 static void
4139 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4140                            stmt_vector_for_cost *cost_vec)
4141 {
4142   unsigned inside_cost, prologue_cost;
4143
4144   if (PURE_SLP_STMT (stmt_info))
4145     return;
4146
4147   /* loop cost for vec_loop.  */
4148   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4149                                   stmt_info, 0, vect_body);
4150
4151   /* prologue cost for vec_init and vec_step.  */
4152   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4153                                     stmt_info, 0, vect_prologue);
4154
4155   if (dump_enabled_p ())
4156     dump_printf_loc (MSG_NOTE, vect_location,
4157                      "vect_model_induction_cost: inside_cost = %d, "
4158                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4159 }
4160
4161
4162
4163 /* Function get_initial_def_for_reduction
4164
4165    Input:
4166    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4167    INIT_VAL - the initial value of the reduction variable
4168
4169    Output:
4170    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4171         of the reduction (used for adjusting the epilog - see below).
4172    Return a vector variable, initialized according to the operation that
4173         STMT_VINFO performs. This vector will be used as the initial value
4174         of the vector of partial results.
4175
4176    Option1 (adjust in epilog): Initialize the vector as follows:
4177      add/bit or/xor:    [0,0,...,0,0]
4178      mult/bit and:      [1,1,...,1,1]
4179      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4180    and when necessary (e.g. add/mult case) let the caller know
4181    that it needs to adjust the result by init_val.
4182
4183    Option2: Initialize the vector as follows:
4184      add/bit or/xor:    [init_val,0,0,...,0]
4185      mult/bit and:      [init_val,1,1,...,1]
4186      min/max/cond_expr: [init_val,init_val,...,init_val]
4187    and no adjustments are needed.
4188
4189    For example, for the following code:
4190
4191    s = init_val;
4192    for (i=0;i<n;i++)
4193      s = s + a[i];
4194
4195    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4196    For a vector of 4 units, we want to return either [0,0,0,init_val],
4197    or [0,0,0,0] and let the caller know that it needs to adjust
4198    the result at the end by 'init_val'.
4199
4200    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4201    initialization vector is simpler (same element in all entries), if
4202    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4203
4204    A cost model should help decide between these two schemes.  */
4205
4206 tree
4207 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4208                                tree *adjustment_def)
4209 {
4210   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4211   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4212   tree scalar_type = TREE_TYPE (init_val);
4213   tree vectype = get_vectype_for_scalar_type (scalar_type);
4214   enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4215   tree def_for_init;
4216   tree init_def;
4217   REAL_VALUE_TYPE real_init_val = dconst0;
4218   int int_init_val = 0;
4219   gimple_seq stmts = NULL;
4220
4221   gcc_assert (vectype);
4222
4223   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4224               || SCALAR_FLOAT_TYPE_P (scalar_type));
4225
4226   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4227               || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4228
4229   vect_reduction_type reduction_type
4230     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4231
4232   switch (code)
4233     {
4234     case WIDEN_SUM_EXPR:
4235     case DOT_PROD_EXPR:
4236     case SAD_EXPR:
4237     case PLUS_EXPR:
4238     case MINUS_EXPR:
4239     case BIT_IOR_EXPR:
4240     case BIT_XOR_EXPR:
4241     case MULT_EXPR:
4242     case BIT_AND_EXPR:
4243       {
4244         /* ADJUSTMENT_DEF is NULL when called from
4245            vect_create_epilog_for_reduction to vectorize double reduction.  */
4246         if (adjustment_def)
4247           *adjustment_def = init_val;
4248
4249         if (code == MULT_EXPR)
4250           {
4251             real_init_val = dconst1;
4252             int_init_val = 1;
4253           }
4254
4255         if (code == BIT_AND_EXPR)
4256           int_init_val = -1;
4257
4258         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4259           def_for_init = build_real (scalar_type, real_init_val);
4260         else
4261           def_for_init = build_int_cst (scalar_type, int_init_val);
4262
4263         if (adjustment_def)
4264           /* Option1: the first element is '0' or '1' as well.  */
4265           init_def = gimple_build_vector_from_val (&stmts, vectype,
4266                                                    def_for_init);
4267         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4268           {
4269             /* Option2 (variable length): the first element is INIT_VAL.  */
4270             init_def = gimple_build_vector_from_val (&stmts, vectype,
4271                                                      def_for_init);
4272             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4273                                      vectype, init_def, init_val);
4274           }
4275         else
4276           {
4277             /* Option2: the first element is INIT_VAL.  */
4278             tree_vector_builder elts (vectype, 1, 2);
4279             elts.quick_push (init_val);
4280             elts.quick_push (def_for_init);
4281             init_def = gimple_build_vector (&stmts, &elts);
4282           }
4283       }
4284       break;
4285
4286     case MIN_EXPR:
4287     case MAX_EXPR:
4288     case COND_EXPR:
4289       {
4290         if (adjustment_def)
4291           {
4292             *adjustment_def = NULL_TREE;
4293             if (reduction_type != COND_REDUCTION
4294                 && reduction_type != EXTRACT_LAST_REDUCTION)
4295               {
4296                 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4297                 break;
4298               }
4299           }
4300         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4301         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4302       }
4303       break;
4304
4305     default:
4306       gcc_unreachable ();
4307     }
4308
4309   if (stmts)
4310     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4311   return init_def;
4312 }
4313
4314 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4315    NUMBER_OF_VECTORS is the number of vector defs to create.
4316    If NEUTRAL_OP is nonnull, introducing extra elements of that
4317    value will not change the result.  */
4318
4319 static void
4320 get_initial_defs_for_reduction (slp_tree slp_node,
4321                                 vec<tree> *vec_oprnds,
4322                                 unsigned int number_of_vectors,
4323                                 bool reduc_chain, tree neutral_op)
4324 {
4325   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4326   stmt_vec_info stmt_vinfo = stmts[0];
4327   unsigned HOST_WIDE_INT nunits;
4328   unsigned j, number_of_places_left_in_vector;
4329   tree vector_type;
4330   unsigned int group_size = stmts.length ();
4331   unsigned int i;
4332   struct loop *loop;
4333
4334   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4335
4336   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4337
4338   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4339   gcc_assert (loop);
4340   edge pe = loop_preheader_edge (loop);
4341
4342   gcc_assert (!reduc_chain || neutral_op);
4343
4344   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4345      created vectors. It is greater than 1 if unrolling is performed.
4346
4347      For example, we have two scalar operands, s1 and s2 (e.g., group of
4348      strided accesses of size two), while NUNITS is four (i.e., four scalars
4349      of this type can be packed in a vector).  The output vector will contain
4350      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4351      will be 2).
4352
4353      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4354      vectors containing the operands.
4355
4356      For example, NUNITS is four as before, and the group size is 8
4357      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4358      {s5, s6, s7, s8}.  */
4359
4360   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4361     nunits = group_size;
4362
4363   number_of_places_left_in_vector = nunits;
4364   bool constant_p = true;
4365   tree_vector_builder elts (vector_type, nunits, 1);
4366   elts.quick_grow (nunits);
4367   gimple_seq ctor_seq = NULL;
4368   for (j = 0; j < nunits * number_of_vectors; ++j)
4369     {
4370       tree op;
4371       i = j % group_size;
4372       stmt_vinfo = stmts[i];
4373
4374       /* Get the def before the loop.  In reduction chain we have only
4375          one initial value.  Else we have as many as PHIs in the group.  */
4376       if (reduc_chain)
4377         op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4378       else if (((vec_oprnds->length () + 1) * nunits
4379                 - number_of_places_left_in_vector >= group_size)
4380                && neutral_op)
4381         op = neutral_op;
4382       else
4383         op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4384
4385       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4386       number_of_places_left_in_vector--;
4387       elts[nunits - number_of_places_left_in_vector - 1] = op;
4388       if (!CONSTANT_CLASS_P (op))
4389         constant_p = false;
4390
4391       if (number_of_places_left_in_vector == 0)
4392         {
4393           tree init;
4394           if (constant_p && !neutral_op
4395               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4396               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4397             /* Build the vector directly from ELTS.  */
4398             init = gimple_build_vector (&ctor_seq, &elts);
4399           else if (neutral_op)
4400             {
4401               /* Build a vector of the neutral value and shift the
4402                  other elements into place.  */
4403               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4404                                                    neutral_op);
4405               int k = nunits;
4406               while (k > 0 && elts[k - 1] == neutral_op)
4407                 k -= 1;
4408               while (k > 0)
4409                 {
4410                   k -= 1;
4411                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4412                                        vector_type, init, elts[k]);
4413                 }
4414             }
4415           else
4416             {
4417               /* First time round, duplicate ELTS to fill the
4418                  required number of vectors.  */
4419               duplicate_and_interleave (&ctor_seq, vector_type, elts,
4420                                         number_of_vectors, *vec_oprnds);
4421               break;
4422             }
4423           vec_oprnds->quick_push (init);
4424
4425           number_of_places_left_in_vector = nunits;
4426           elts.new_vector (vector_type, nunits, 1);
4427           elts.quick_grow (nunits);
4428           constant_p = true;
4429         }
4430     }
4431   if (ctor_seq != NULL)
4432     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4433 }
4434
4435
4436 /* Function vect_create_epilog_for_reduction
4437
4438    Create code at the loop-epilog to finalize the result of a reduction
4439    computation.
4440
4441    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4442      reduction statements.
4443    STMT_INFO is the scalar reduction stmt that is being vectorized.
4444    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4445      number of elements that we can fit in a vectype (nunits).  In this case
4446      we have to generate more than one vector stmt - i.e - we need to "unroll"
4447      the vector stmt by a factor VF/nunits.  For more details see documentation
4448      in vectorizable_operation.
4449    REDUC_FN is the internal function for the epilog reduction.
4450    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4451      computation.
4452    REDUC_INDEX is the index of the operand in the right hand side of the
4453      statement that is defined by REDUCTION_PHI.
4454    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4455    SLP_NODE is an SLP node containing a group of reduction statements. The
4456      first one in this group is STMT_INFO.
4457    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4458      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4459      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4460      any value of the IV in the loop.
4461    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4462    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4463      null if this is not an SLP reduction
4464
4465    This function:
4466    1. Creates the reduction def-use cycles: sets the arguments for
4467       REDUCTION_PHIS:
4468       The loop-entry argument is the vectorized initial-value of the reduction.
4469       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4470       sums.
4471    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4472       by calling the function specified by REDUC_FN if available, or by
4473       other means (whole-vector shifts or a scalar loop).
4474       The function also creates a new phi node at the loop exit to preserve
4475       loop-closed form, as illustrated below.
4476
4477      The flow at the entry to this function:
4478
4479         loop:
4480           vec_def = phi <null, null>            # REDUCTION_PHI
4481           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4482           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4483         loop_exit:
4484           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4485           use <s_out0>
4486           use <s_out0>
4487
4488      The above is transformed by this function into:
4489
4490         loop:
4491           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4492           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4493           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4494         loop_exit:
4495           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4496           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4497           v_out2 = reduce <v_out1>
4498           s_out3 = extract_field <v_out2, 0>
4499           s_out4 = adjust_result <s_out3>
4500           use <s_out4>
4501           use <s_out4>
4502 */
4503
4504 static void
4505 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4506                                   stmt_vec_info stmt_info,
4507                                   gimple *reduc_def_stmt,
4508                                   int ncopies, internal_fn reduc_fn,
4509                                   vec<stmt_vec_info> reduction_phis,
4510                                   bool double_reduc,
4511                                   slp_tree slp_node,
4512                                   slp_instance slp_node_instance,
4513                                   tree induc_val, enum tree_code induc_code,
4514                                   tree neutral_op)
4515 {
4516   stmt_vec_info prev_phi_info;
4517   tree vectype;
4518   machine_mode mode;
4519   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4520   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4521   basic_block exit_bb;
4522   tree scalar_dest;
4523   tree scalar_type;
4524   gimple *new_phi = NULL, *phi;
4525   stmt_vec_info phi_info;
4526   gimple_stmt_iterator exit_gsi;
4527   tree vec_dest;
4528   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4529   gimple *epilog_stmt = NULL;
4530   enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4531   gimple *exit_phi;
4532   tree bitsize;
4533   tree adjustment_def = NULL;
4534   tree vec_initial_def = NULL;
4535   tree expr, def, initial_def = NULL;
4536   tree orig_name, scalar_result;
4537   imm_use_iterator imm_iter, phi_imm_iter;
4538   use_operand_p use_p, phi_use_p;
4539   gimple *use_stmt;
4540   stmt_vec_info reduction_phi_info = NULL;
4541   bool nested_in_vect_loop = false;
4542   auto_vec<gimple *> new_phis;
4543   auto_vec<stmt_vec_info> inner_phis;
4544   int j, i;
4545   auto_vec<tree> scalar_results;
4546   unsigned int group_size = 1, k, ratio;
4547   auto_vec<tree> vec_initial_defs;
4548   auto_vec<gimple *> phis;
4549   bool slp_reduc = false;
4550   bool direct_slp_reduc;
4551   tree new_phi_result;
4552   stmt_vec_info inner_phi = NULL;
4553   tree induction_index = NULL_TREE;
4554
4555   if (slp_node)
4556     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4557
4558   if (nested_in_vect_loop_p (loop, stmt_info))
4559     {
4560       outer_loop = loop;
4561       loop = loop->inner;
4562       nested_in_vect_loop = true;
4563       gcc_assert (!slp_node);
4564     }
4565
4566   vectype = STMT_VINFO_VECTYPE (stmt_info);
4567   gcc_assert (vectype);
4568   mode = TYPE_MODE (vectype);
4569
4570   /* 1. Create the reduction def-use cycle:
4571      Set the arguments of REDUCTION_PHIS, i.e., transform
4572
4573         loop:
4574           vec_def = phi <null, null>            # REDUCTION_PHI
4575           VECT_DEF = vector_stmt                # vectorized form of STMT
4576           ...
4577
4578      into:
4579
4580         loop:
4581           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4582           VECT_DEF = vector_stmt                # vectorized form of STMT
4583           ...
4584
4585      (in case of SLP, do it for all the phis). */
4586
4587   /* Get the loop-entry arguments.  */
4588   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4589   if (slp_node)
4590     {
4591       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4592       vec_initial_defs.reserve (vec_num);
4593       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4594                                       &vec_initial_defs, vec_num,
4595                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4596                                       neutral_op);
4597     }
4598   else
4599     {
4600       /* Get at the scalar def before the loop, that defines the initial value
4601          of the reduction variable.  */
4602       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4603                                            loop_preheader_edge (loop));
4604       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4605          and we can't use zero for induc_val, use initial_def.  Similarly
4606          for REDUC_MIN and initial_def larger than the base.  */
4607       if (TREE_CODE (initial_def) == INTEGER_CST
4608           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4609               == INTEGER_INDUC_COND_REDUCTION)
4610           && !integer_zerop (induc_val)
4611           && ((induc_code == MAX_EXPR
4612                && tree_int_cst_lt (initial_def, induc_val))
4613               || (induc_code == MIN_EXPR
4614                   && tree_int_cst_lt (induc_val, initial_def))))
4615         induc_val = initial_def;
4616
4617       if (double_reduc)
4618         /* In case of double reduction we only create a vector variable
4619            to be put in the reduction phi node.  The actual statement
4620            creation is done later in this function.  */
4621         vec_initial_def = vect_create_destination_var (initial_def, vectype);
4622       else if (nested_in_vect_loop)
4623         {
4624           /* Do not use an adjustment def as that case is not supported
4625              correctly if ncopies is not one.  */
4626           vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4627           vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4628                                                           stmt_info);
4629         }
4630       else
4631         vec_initial_def
4632           = get_initial_def_for_reduction (stmt_info, initial_def,
4633                                            &adjustment_def);
4634       vec_initial_defs.create (1);
4635       vec_initial_defs.quick_push (vec_initial_def);
4636     }
4637
4638   /* Set phi nodes arguments.  */
4639   FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4640     {
4641       tree vec_init_def = vec_initial_defs[i];
4642       tree def = vect_defs[i];
4643       for (j = 0; j < ncopies; j++)
4644         {
4645           if (j != 0)
4646             {
4647               phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4648               if (nested_in_vect_loop)
4649                 vec_init_def
4650                   = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4651             }
4652
4653           /* Set the loop-entry arg of the reduction-phi.  */
4654
4655           gphi *phi = as_a <gphi *> (phi_info->stmt);
4656           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4657               == INTEGER_INDUC_COND_REDUCTION)
4658             {
4659               /* Initialise the reduction phi to zero.  This prevents initial
4660                  values of non-zero interfering with the reduction op.  */
4661               gcc_assert (ncopies == 1);
4662               gcc_assert (i == 0);
4663
4664               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4665               tree induc_val_vec
4666                 = build_vector_from_val (vec_init_def_type, induc_val);
4667
4668               add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4669                            UNKNOWN_LOCATION);
4670             }
4671           else
4672             add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4673                          UNKNOWN_LOCATION);
4674
4675           /* Set the loop-latch arg for the reduction-phi.  */
4676           if (j > 0)
4677             def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4678
4679           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4680
4681           if (dump_enabled_p ())
4682             dump_printf_loc (MSG_NOTE, vect_location,
4683                              "transform reduction: created def-use cycle: %G%G",
4684                              phi, SSA_NAME_DEF_STMT (def));
4685         }
4686     }
4687
4688   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4689      which is updated with the current index of the loop for every match of
4690      the original loop's cond_expr (VEC_STMT).  This results in a vector
4691      containing the last time the condition passed for that vector lane.
4692      The first match will be a 1 to allow 0 to be used for non-matching
4693      indexes.  If there are no matches at all then the vector will be all
4694      zeroes.  */
4695   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4696     {
4697       tree indx_before_incr, indx_after_incr;
4698       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4699
4700       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4701       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4702
4703       int scalar_precision
4704         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4705       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4706       tree cr_index_vector_type = build_vector_type
4707         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4708
4709       /* First we create a simple vector induction variable which starts
4710          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4711          vector size (STEP).  */
4712
4713       /* Create a {1,2,3,...} vector.  */
4714       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4715
4716       /* Create a vector of the step value.  */
4717       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4718       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4719
4720       /* Create an induction variable.  */
4721       gimple_stmt_iterator incr_gsi;
4722       bool insert_after;
4723       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4724       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4725                  insert_after, &indx_before_incr, &indx_after_incr);
4726
4727       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4728          filled with zeros (VEC_ZERO).  */
4729
4730       /* Create a vector of 0s.  */
4731       tree zero = build_zero_cst (cr_index_scalar_type);
4732       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4733
4734       /* Create a vector phi node.  */
4735       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4736       new_phi = create_phi_node (new_phi_tree, loop->header);
4737       loop_vinfo->add_stmt (new_phi);
4738       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4739                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4740
4741       /* Now take the condition from the loop's original cond_expr
4742          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4743          every match uses values from the induction variable
4744          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4745          (NEW_PHI_TREE).
4746          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4747          the new cond_expr (INDEX_COND_EXPR).  */
4748
4749       /* Duplicate the condition from vec_stmt.  */
4750       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4751
4752       /* Create a conditional, where the condition is taken from vec_stmt
4753          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4754          else is the phi (NEW_PHI_TREE).  */
4755       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4756                                      ccompare, indx_before_incr,
4757                                      new_phi_tree);
4758       induction_index = make_ssa_name (cr_index_vector_type);
4759       gimple *index_condition = gimple_build_assign (induction_index,
4760                                                      index_cond_expr);
4761       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4762       stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4763       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4764
4765       /* Update the phi with the vec cond.  */
4766       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4767                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4768     }
4769
4770   /* 2. Create epilog code.
4771         The reduction epilog code operates across the elements of the vector
4772         of partial results computed by the vectorized loop.
4773         The reduction epilog code consists of:
4774
4775         step 1: compute the scalar result in a vector (v_out2)
4776         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4777         step 3: adjust the scalar result (s_out3) if needed.
4778
4779         Step 1 can be accomplished using one of the following three schemes:
4780           (scheme 1) using reduc_fn, if available.
4781           (scheme 2) using whole-vector shifts, if available.
4782           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4783                      combined.
4784
4785           The overall epilog code looks like this:
4786
4787           s_out0 = phi <s_loop>         # original EXIT_PHI
4788           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4789           v_out2 = reduce <v_out1>              # step 1
4790           s_out3 = extract_field <v_out2, 0>    # step 2
4791           s_out4 = adjust_result <s_out3>       # step 3
4792
4793           (step 3 is optional, and steps 1 and 2 may be combined).
4794           Lastly, the uses of s_out0 are replaced by s_out4.  */
4795
4796
4797   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4798          v_out1 = phi <VECT_DEF>
4799          Store them in NEW_PHIS.  */
4800
4801   exit_bb = single_exit (loop)->dest;
4802   prev_phi_info = NULL;
4803   new_phis.create (vect_defs.length ());
4804   FOR_EACH_VEC_ELT (vect_defs, i, def)
4805     {
4806       for (j = 0; j < ncopies; j++)
4807         {
4808           tree new_def = copy_ssa_name (def);
4809           phi = create_phi_node (new_def, exit_bb);
4810           stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4811           if (j == 0)
4812             new_phis.quick_push (phi);
4813           else
4814             {
4815               def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4816               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4817             }
4818
4819           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4820           prev_phi_info = phi_info;
4821         }
4822     }
4823
4824   /* The epilogue is created for the outer-loop, i.e., for the loop being
4825      vectorized.  Create exit phis for the outer loop.  */
4826   if (double_reduc)
4827     {
4828       loop = outer_loop;
4829       exit_bb = single_exit (loop)->dest;
4830       inner_phis.create (vect_defs.length ());
4831       FOR_EACH_VEC_ELT (new_phis, i, phi)
4832         {
4833           stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4834           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4835           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4836           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4837                            PHI_RESULT (phi));
4838           prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4839           inner_phis.quick_push (phi_info);
4840           new_phis[i] = outer_phi;
4841           while (STMT_VINFO_RELATED_STMT (phi_info))
4842             {
4843               phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4844               new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4845               outer_phi = create_phi_node (new_result, exit_bb);
4846               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4847                                PHI_RESULT (phi_info->stmt));
4848               stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4849               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4850               prev_phi_info = outer_phi_info;
4851             }
4852         }
4853     }
4854
4855   exit_gsi = gsi_after_labels (exit_bb);
4856
4857   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4858          (i.e. when reduc_fn is not available) and in the final adjustment
4859          code (if needed).  Also get the original scalar reduction variable as
4860          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4861          represents a reduction pattern), the tree-code and scalar-def are
4862          taken from the original stmt that the pattern-stmt (STMT) replaces.
4863          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4864          are taken from STMT.  */
4865
4866   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4867   if (orig_stmt_info != stmt_info)
4868     {
4869       /* Reduction pattern  */
4870       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4871       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4872     }
4873
4874   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4875   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4876      partial results are added and not subtracted.  */
4877   if (code == MINUS_EXPR)
4878     code = PLUS_EXPR;
4879
4880   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4881   scalar_type = TREE_TYPE (scalar_dest);
4882   scalar_results.create (group_size);
4883   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4884   bitsize = TYPE_SIZE (scalar_type);
4885
4886   /* In case this is a reduction in an inner-loop while vectorizing an outer
4887      loop - we don't need to extract a single scalar result at the end of the
4888      inner-loop (unless it is double reduction, i.e., the use of reduction is
4889      outside the outer-loop).  The final vector of partial results will be used
4890      in the vectorized outer-loop, or reduced to a scalar result at the end of
4891      the outer-loop.  */
4892   if (nested_in_vect_loop && !double_reduc)
4893     goto vect_finalize_reduction;
4894
4895   /* SLP reduction without reduction chain, e.g.,
4896      # a1 = phi <a2, a0>
4897      # b1 = phi <b2, b0>
4898      a2 = operation (a1)
4899      b2 = operation (b1)  */
4900   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4901
4902   /* True if we should implement SLP_REDUC using native reduction operations
4903      instead of scalar operations.  */
4904   direct_slp_reduc = (reduc_fn != IFN_LAST
4905                       && slp_reduc
4906                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4907
4908   /* In case of reduction chain, e.g.,
4909      # a1 = phi <a3, a0>
4910      a2 = operation (a1)
4911      a3 = operation (a2),
4912
4913      we may end up with more than one vector result.  Here we reduce them to
4914      one vector.  */
4915   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4916     {
4917       tree first_vect = PHI_RESULT (new_phis[0]);
4918       gassign *new_vec_stmt = NULL;
4919       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4920       for (k = 1; k < new_phis.length (); k++)
4921         {
4922           gimple *next_phi = new_phis[k];
4923           tree second_vect = PHI_RESULT (next_phi);
4924           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4925           new_vec_stmt = gimple_build_assign (tem, code,
4926                                               first_vect, second_vect);
4927           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4928           first_vect = tem;
4929         }
4930
4931       new_phi_result = first_vect;
4932       if (new_vec_stmt)
4933         {
4934           new_phis.truncate (0);
4935           new_phis.safe_push (new_vec_stmt);
4936         }
4937     }
4938   /* Likewise if we couldn't use a single defuse cycle.  */
4939   else if (ncopies > 1)
4940     {
4941       gcc_assert (new_phis.length () == 1);
4942       tree first_vect = PHI_RESULT (new_phis[0]);
4943       gassign *new_vec_stmt = NULL;
4944       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4945       stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4946       for (int k = 1; k < ncopies; ++k)
4947         {
4948           next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4949           tree second_vect = PHI_RESULT (next_phi_info->stmt);
4950           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4951           new_vec_stmt = gimple_build_assign (tem, code,
4952                                               first_vect, second_vect);
4953           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4954           first_vect = tem;
4955         }
4956       new_phi_result = first_vect;
4957       new_phis.truncate (0);
4958       new_phis.safe_push (new_vec_stmt);
4959     }
4960   else
4961     new_phi_result = PHI_RESULT (new_phis[0]);
4962
4963   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4964       && reduc_fn != IFN_LAST)
4965     {
4966       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4967          various data values where the condition matched and another vector
4968          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4969          need to extract the last matching index (which will be the index with
4970          highest value) and use this to index into the data vector.
4971          For the case where there were no matches, the data vector will contain
4972          all default values and the index vector will be all zeros.  */
4973
4974       /* Get various versions of the type of the vector of indexes.  */
4975       tree index_vec_type = TREE_TYPE (induction_index);
4976       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4977       tree index_scalar_type = TREE_TYPE (index_vec_type);
4978       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4979         (index_vec_type);
4980
4981       /* Get an unsigned integer version of the type of the data vector.  */
4982       int scalar_precision
4983         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4984       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4985       tree vectype_unsigned = build_vector_type
4986         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4987
4988       /* First we need to create a vector (ZERO_VEC) of zeros and another
4989          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4990          can create using a MAX reduction and then expanding.
4991          In the case where the loop never made any matches, the max index will
4992          be zero.  */
4993
4994       /* Vector of {0, 0, 0,...}.  */
4995       tree zero_vec = make_ssa_name (vectype);
4996       tree zero_vec_rhs = build_zero_cst (vectype);
4997       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4998       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4999
5000       /* Find maximum value from the vector of found indexes.  */
5001       tree max_index = make_ssa_name (index_scalar_type);
5002       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5003                                                           1, induction_index);
5004       gimple_call_set_lhs (max_index_stmt, max_index);
5005       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5006
5007       /* Vector of {max_index, max_index, max_index,...}.  */
5008       tree max_index_vec = make_ssa_name (index_vec_type);
5009       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5010                                                       max_index);
5011       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5012                                                         max_index_vec_rhs);
5013       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5014
5015       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5016          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5017          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5018          otherwise.  Only one value should match, resulting in a vector
5019          (VEC_COND) with one data value and the rest zeros.
5020          In the case where the loop never made any matches, every index will
5021          match, resulting in a vector with all data values (which will all be
5022          the default value).  */
5023
5024       /* Compare the max index vector to the vector of found indexes to find
5025          the position of the max value.  */
5026       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5027       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5028                                                       induction_index,
5029                                                       max_index_vec);
5030       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5031
5032       /* Use the compare to choose either values from the data vector or
5033          zero.  */
5034       tree vec_cond = make_ssa_name (vectype);
5035       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5036                                                    vec_compare, new_phi_result,
5037                                                    zero_vec);
5038       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5039
5040       /* Finally we need to extract the data value from the vector (VEC_COND)
5041          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5042          reduction, but because this doesn't exist, we can use a MAX reduction
5043          instead.  The data value might be signed or a float so we need to cast
5044          it first.
5045          In the case where the loop never made any matches, the data values are
5046          all identical, and so will reduce down correctly.  */
5047
5048       /* Make the matched data values unsigned.  */
5049       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5050       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5051                                        vec_cond);
5052       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5053                                                         VIEW_CONVERT_EXPR,
5054                                                         vec_cond_cast_rhs);
5055       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5056
5057       /* Reduce down to a scalar value.  */
5058       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5059       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5060                                                            1, vec_cond_cast);
5061       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5062       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5063
5064       /* Convert the reduced value back to the result type and set as the
5065          result.  */
5066       gimple_seq stmts = NULL;
5067       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5068                                data_reduc);
5069       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5070       scalar_results.safe_push (new_temp);
5071     }
5072   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5073            && reduc_fn == IFN_LAST)
5074     {
5075       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5076          idx = 0;
5077          idx_val = induction_index[0];
5078          val = data_reduc[0];
5079          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5080            if (induction_index[i] > idx_val)
5081              val = data_reduc[i], idx_val = induction_index[i];
5082          return val;  */
5083
5084       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5085       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5086       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5087       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5088       /* Enforced by vectorizable_reduction, which ensures we have target
5089          support before allowing a conditional reduction on variable-length
5090          vectors.  */
5091       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5092       tree idx_val = NULL_TREE, val = NULL_TREE;
5093       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5094         {
5095           tree old_idx_val = idx_val;
5096           tree old_val = val;
5097           idx_val = make_ssa_name (idx_eltype);
5098           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5099                                              build3 (BIT_FIELD_REF, idx_eltype,
5100                                                      induction_index,
5101                                                      bitsize_int (el_size),
5102                                                      bitsize_int (off)));
5103           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5104           val = make_ssa_name (data_eltype);
5105           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5106                                              build3 (BIT_FIELD_REF,
5107                                                      data_eltype,
5108                                                      new_phi_result,
5109                                                      bitsize_int (el_size),
5110                                                      bitsize_int (off)));
5111           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5112           if (off != 0)
5113             {
5114               tree new_idx_val = idx_val;
5115               if (off != v_size - el_size)
5116                 {
5117                   new_idx_val = make_ssa_name (idx_eltype);
5118                   epilog_stmt = gimple_build_assign (new_idx_val,
5119                                                      MAX_EXPR, idx_val,
5120                                                      old_idx_val);
5121                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5122                 }
5123               tree new_val = make_ssa_name (data_eltype);
5124               epilog_stmt = gimple_build_assign (new_val,
5125                                                  COND_EXPR,
5126                                                  build2 (GT_EXPR,
5127                                                          boolean_type_node,
5128                                                          idx_val,
5129                                                          old_idx_val),
5130                                                  val, old_val);
5131               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5132               idx_val = new_idx_val;
5133               val = new_val;
5134             }
5135         }
5136       /* Convert the reduced value back to the result type and set as the
5137          result.  */
5138       gimple_seq stmts = NULL;
5139       val = gimple_convert (&stmts, scalar_type, val);
5140       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5141       scalar_results.safe_push (val);
5142     }
5143
5144   /* 2.3 Create the reduction code, using one of the three schemes described
5145          above. In SLP we simply need to extract all the elements from the
5146          vector (without reducing them), so we use scalar shifts.  */
5147   else if (reduc_fn != IFN_LAST && !slp_reduc)
5148     {
5149       tree tmp;
5150       tree vec_elem_type;
5151
5152       /* Case 1:  Create:
5153          v_out2 = reduc_expr <v_out1>  */
5154
5155       if (dump_enabled_p ())
5156         dump_printf_loc (MSG_NOTE, vect_location,
5157                          "Reduce using direct vector reduction.\n");
5158
5159       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5160       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5161         {
5162           tree tmp_dest
5163             = vect_create_destination_var (scalar_dest, vec_elem_type);
5164           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5165                                                     new_phi_result);
5166           gimple_set_lhs (epilog_stmt, tmp_dest);
5167           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5168           gimple_set_lhs (epilog_stmt, new_temp);
5169           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5170
5171           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5172                                              new_temp);
5173         }
5174       else
5175         {
5176           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5177                                                     new_phi_result);
5178           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5179         }
5180
5181       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5182       gimple_set_lhs (epilog_stmt, new_temp);
5183       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5184
5185       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5186            == INTEGER_INDUC_COND_REDUCTION)
5187           && !operand_equal_p (initial_def, induc_val, 0))
5188         {
5189           /* Earlier we set the initial value to be a vector of induc_val
5190              values.  Check the result and if it is induc_val then replace
5191              with the original initial value, unless induc_val is
5192              the same as initial_def already.  */
5193           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5194                                   induc_val);
5195
5196           tmp = make_ssa_name (new_scalar_dest);
5197           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5198                                              initial_def, new_temp);
5199           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5200           new_temp = tmp;
5201         }
5202
5203       scalar_results.safe_push (new_temp);
5204     }
5205   else if (direct_slp_reduc)
5206     {
5207       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5208          with the elements for other SLP statements replaced with the
5209          neutral value.  We can then do a normal reduction on each vector.  */
5210
5211       /* Enforced by vectorizable_reduction.  */
5212       gcc_assert (new_phis.length () == 1);
5213       gcc_assert (pow2p_hwi (group_size));
5214
5215       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5216       vec<stmt_vec_info> orig_phis
5217         = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5218       gimple_seq seq = NULL;
5219
5220       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5221          and the same element size as VECTYPE.  */
5222       tree index = build_index_vector (vectype, 0, 1);
5223       tree index_type = TREE_TYPE (index);
5224       tree index_elt_type = TREE_TYPE (index_type);
5225       tree mask_type = build_same_sized_truth_vector_type (index_type);
5226
5227       /* Create a vector that, for each element, identifies which of
5228          the REDUC_GROUP_SIZE results should use it.  */
5229       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5230       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5231                             build_vector_from_val (index_type, index_mask));
5232
5233       /* Get a neutral vector value.  This is simply a splat of the neutral
5234          scalar value if we have one, otherwise the initial scalar value
5235          is itself a neutral value.  */
5236       tree vector_identity = NULL_TREE;
5237       if (neutral_op)
5238         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5239                                                         neutral_op);
5240       for (unsigned int i = 0; i < group_size; ++i)
5241         {
5242           /* If there's no universal neutral value, we can use the
5243              initial scalar value from the original PHI.  This is used
5244              for MIN and MAX reduction, for example.  */
5245           if (!neutral_op)
5246             {
5247               tree scalar_value
5248                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5249                                          loop_preheader_edge (loop));
5250               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5251                                                               scalar_value);
5252             }
5253
5254           /* Calculate the equivalent of:
5255
5256              sel[j] = (index[j] == i);
5257
5258              which selects the elements of NEW_PHI_RESULT that should
5259              be included in the result.  */
5260           tree compare_val = build_int_cst (index_elt_type, i);
5261           compare_val = build_vector_from_val (index_type, compare_val);
5262           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5263                                    index, compare_val);
5264
5265           /* Calculate the equivalent of:
5266
5267              vec = sel ? new_phi_result : vector_identity;
5268
5269              VEC is now suitable for a full vector reduction.  */
5270           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5271                                    sel, new_phi_result, vector_identity);
5272
5273           /* Do the reduction and convert it to the appropriate type.  */
5274           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5275                                       TREE_TYPE (vectype), vec);
5276           scalar = gimple_convert (&seq, scalar_type, scalar);
5277           scalar_results.safe_push (scalar);
5278         }
5279       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5280     }
5281   else
5282     {
5283       bool reduce_with_shift;
5284       tree vec_temp;
5285
5286       /* COND reductions all do the final reduction with MAX_EXPR
5287          or MIN_EXPR.  */
5288       if (code == COND_EXPR)
5289         {
5290           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5291               == INTEGER_INDUC_COND_REDUCTION)
5292             code = induc_code;
5293           else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5294                    == CONST_COND_REDUCTION)
5295             code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5296           else
5297             code = MAX_EXPR;
5298         }
5299
5300       /* See if the target wants to do the final (shift) reduction
5301          in a vector mode of smaller size and first reduce upper/lower
5302          halves against each other.  */
5303       enum machine_mode mode1 = mode;
5304       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5305       unsigned sz1 = sz;
5306       if (!slp_reduc
5307           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5308         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5309
5310       tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5311       reduce_with_shift = have_whole_vector_shift (mode1);
5312       if (!VECTOR_MODE_P (mode1))
5313         reduce_with_shift = false;
5314       else
5315         {
5316           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5317           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5318             reduce_with_shift = false;
5319         }
5320
5321       /* First reduce the vector to the desired vector size we should
5322          do shift reduction on by combining upper and lower halves.  */
5323       new_temp = new_phi_result;
5324       while (sz > sz1)
5325         {
5326           gcc_assert (!slp_reduc);
5327           sz /= 2;
5328           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5329
5330           /* The target has to make sure we support lowpart/highpart
5331              extraction, either via direct vector extract or through
5332              an integer mode punning.  */
5333           tree dst1, dst2;
5334           if (convert_optab_handler (vec_extract_optab,
5335                                      TYPE_MODE (TREE_TYPE (new_temp)),
5336                                      TYPE_MODE (vectype1))
5337               != CODE_FOR_nothing)
5338             {
5339               /* Extract sub-vectors directly once vec_extract becomes
5340                  a conversion optab.  */
5341               dst1 = make_ssa_name (vectype1);
5342               epilog_stmt
5343                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5344                                          build3 (BIT_FIELD_REF, vectype1,
5345                                                  new_temp, TYPE_SIZE (vectype1),
5346                                                  bitsize_int (0)));
5347               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5348               dst2 =  make_ssa_name (vectype1);
5349               epilog_stmt
5350                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5351                                          build3 (BIT_FIELD_REF, vectype1,
5352                                                  new_temp, TYPE_SIZE (vectype1),
5353                                                  bitsize_int (sz * BITS_PER_UNIT)));
5354               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5355             }
5356           else
5357             {
5358               /* Extract via punning to appropriately sized integer mode
5359                  vector.  */
5360               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5361                                                             1);
5362               tree etype = build_vector_type (eltype, 2);
5363               gcc_assert (convert_optab_handler (vec_extract_optab,
5364                                                  TYPE_MODE (etype),
5365                                                  TYPE_MODE (eltype))
5366                           != CODE_FOR_nothing);
5367               tree tem = make_ssa_name (etype);
5368               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5369                                                  build1 (VIEW_CONVERT_EXPR,
5370                                                          etype, new_temp));
5371               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5372               new_temp = tem;
5373               tem = make_ssa_name (eltype);
5374               epilog_stmt
5375                   = gimple_build_assign (tem, BIT_FIELD_REF,
5376                                          build3 (BIT_FIELD_REF, eltype,
5377                                                  new_temp, TYPE_SIZE (eltype),
5378                                                  bitsize_int (0)));
5379               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5380               dst1 = make_ssa_name (vectype1);
5381               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5382                                                  build1 (VIEW_CONVERT_EXPR,
5383                                                          vectype1, tem));
5384               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385               tem = make_ssa_name (eltype);
5386               epilog_stmt
5387                   = gimple_build_assign (tem, BIT_FIELD_REF,
5388                                          build3 (BIT_FIELD_REF, eltype,
5389                                                  new_temp, TYPE_SIZE (eltype),
5390                                                  bitsize_int (sz * BITS_PER_UNIT)));
5391               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5392               dst2 =  make_ssa_name (vectype1);
5393               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5394                                                  build1 (VIEW_CONVERT_EXPR,
5395                                                          vectype1, tem));
5396               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5397             }
5398
5399           new_temp = make_ssa_name (vectype1);
5400           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5401           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5402         }
5403
5404       if (reduce_with_shift && !slp_reduc)
5405         {
5406           int element_bitsize = tree_to_uhwi (bitsize);
5407           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5408              for variable-length vectors and also requires direct target support
5409              for loop reductions.  */
5410           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5411           int nelements = vec_size_in_bits / element_bitsize;
5412           vec_perm_builder sel;
5413           vec_perm_indices indices;
5414
5415           int elt_offset;
5416
5417           tree zero_vec = build_zero_cst (vectype1);
5418           /* Case 2: Create:
5419              for (offset = nelements/2; offset >= 1; offset/=2)
5420                 {
5421                   Create:  va' = vec_shift <va, offset>
5422                   Create:  va = vop <va, va'>
5423                 }  */
5424
5425           tree rhs;
5426
5427           if (dump_enabled_p ())
5428             dump_printf_loc (MSG_NOTE, vect_location,
5429                              "Reduce using vector shifts\n");
5430
5431           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5432           for (elt_offset = nelements / 2;
5433                elt_offset >= 1;
5434                elt_offset /= 2)
5435             {
5436               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5437               indices.new_vector (sel, 2, nelements);
5438               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5439               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5440                                                  new_temp, zero_vec, mask);
5441               new_name = make_ssa_name (vec_dest, epilog_stmt);
5442               gimple_assign_set_lhs (epilog_stmt, new_name);
5443               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5444
5445               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5446                                                  new_temp);
5447               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5448               gimple_assign_set_lhs (epilog_stmt, new_temp);
5449               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5450             }
5451
5452           /* 2.4  Extract the final scalar result.  Create:
5453              s_out3 = extract_field <v_out2, bitpos>  */
5454
5455           if (dump_enabled_p ())
5456             dump_printf_loc (MSG_NOTE, vect_location,
5457                              "extract scalar result\n");
5458
5459           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5460                         bitsize, bitsize_zero_node);
5461           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5462           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5463           gimple_assign_set_lhs (epilog_stmt, new_temp);
5464           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5465           scalar_results.safe_push (new_temp);
5466         }
5467       else
5468         {
5469           /* Case 3: Create:
5470              s = extract_field <v_out2, 0>
5471              for (offset = element_size;
5472                   offset < vector_size;
5473                   offset += element_size;)
5474                {
5475                  Create:  s' = extract_field <v_out2, offset>
5476                  Create:  s = op <s, s'>  // For non SLP cases
5477                }  */
5478
5479           if (dump_enabled_p ())
5480             dump_printf_loc (MSG_NOTE, vect_location,
5481                              "Reduce using scalar code.\n");
5482
5483           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5484           int element_bitsize = tree_to_uhwi (bitsize);
5485           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5486             {
5487               int bit_offset;
5488               if (gimple_code (new_phi) == GIMPLE_PHI)
5489                 vec_temp = PHI_RESULT (new_phi);
5490               else
5491                 vec_temp = gimple_assign_lhs (new_phi);
5492               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5493                                  bitsize_zero_node);
5494               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5495               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5496               gimple_assign_set_lhs (epilog_stmt, new_temp);
5497               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5498
5499               /* In SLP we don't need to apply reduction operation, so we just
5500                  collect s' values in SCALAR_RESULTS.  */
5501               if (slp_reduc)
5502                 scalar_results.safe_push (new_temp);
5503
5504               for (bit_offset = element_bitsize;
5505                    bit_offset < vec_size_in_bits;
5506                    bit_offset += element_bitsize)
5507                 {
5508                   tree bitpos = bitsize_int (bit_offset);
5509                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5510                                      bitsize, bitpos);
5511
5512                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5513                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5514                   gimple_assign_set_lhs (epilog_stmt, new_name);
5515                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5516
5517                   if (slp_reduc)
5518                     {
5519                       /* In SLP we don't need to apply reduction operation, so
5520                          we just collect s' values in SCALAR_RESULTS.  */
5521                       new_temp = new_name;
5522                       scalar_results.safe_push (new_name);
5523                     }
5524                   else
5525                     {
5526                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5527                                                          new_name, new_temp);
5528                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5529                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5530                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5531                     }
5532                 }
5533             }
5534
5535           /* The only case where we need to reduce scalar results in SLP, is
5536              unrolling.  If the size of SCALAR_RESULTS is greater than
5537              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5538              REDUC_GROUP_SIZE.  */
5539           if (slp_reduc)
5540             {
5541               tree res, first_res, new_res;
5542               gimple *new_stmt;
5543
5544               /* Reduce multiple scalar results in case of SLP unrolling.  */
5545               for (j = group_size; scalar_results.iterate (j, &res);
5546                    j++)
5547                 {
5548                   first_res = scalar_results[j % group_size];
5549                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5550                                                   first_res, res);
5551                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5552                   gimple_assign_set_lhs (new_stmt, new_res);
5553                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5554                   scalar_results[j % group_size] = new_res;
5555                 }
5556             }
5557           else
5558             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5559             scalar_results.safe_push (new_temp);
5560         }
5561
5562       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5563            == INTEGER_INDUC_COND_REDUCTION)
5564           && !operand_equal_p (initial_def, induc_val, 0))
5565         {
5566           /* Earlier we set the initial value to be a vector of induc_val
5567              values.  Check the result and if it is induc_val then replace
5568              with the original initial value, unless induc_val is
5569              the same as initial_def already.  */
5570           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5571                                   induc_val);
5572
5573           tree tmp = make_ssa_name (new_scalar_dest);
5574           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5575                                              initial_def, new_temp);
5576           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5577           scalar_results[0] = tmp;
5578         }
5579     }
5580
5581 vect_finalize_reduction:
5582
5583   if (double_reduc)
5584     loop = loop->inner;
5585
5586   /* 2.5 Adjust the final result by the initial value of the reduction
5587          variable. (When such adjustment is not needed, then
5588          'adjustment_def' is zero).  For example, if code is PLUS we create:
5589          new_temp = loop_exit_def + adjustment_def  */
5590
5591   if (adjustment_def)
5592     {
5593       gcc_assert (!slp_reduc);
5594       if (nested_in_vect_loop)
5595         {
5596           new_phi = new_phis[0];
5597           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5598           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5599           new_dest = vect_create_destination_var (scalar_dest, vectype);
5600         }
5601       else
5602         {
5603           new_temp = scalar_results[0];
5604           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5605           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5606           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5607         }
5608
5609       epilog_stmt = gimple_build_assign (new_dest, expr);
5610       new_temp = make_ssa_name (new_dest, epilog_stmt);
5611       gimple_assign_set_lhs (epilog_stmt, new_temp);
5612       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5613       if (nested_in_vect_loop)
5614         {
5615           stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5616           STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5617             = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5618
5619           if (!double_reduc)
5620             scalar_results.quick_push (new_temp);
5621           else
5622             scalar_results[0] = new_temp;
5623         }
5624       else
5625         scalar_results[0] = new_temp;
5626
5627       new_phis[0] = epilog_stmt;
5628     }
5629
5630   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5631           phis with new adjusted scalar results, i.e., replace use <s_out0>
5632           with use <s_out4>.
5633
5634      Transform:
5635         loop_exit:
5636           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5637           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5638           v_out2 = reduce <v_out1>
5639           s_out3 = extract_field <v_out2, 0>
5640           s_out4 = adjust_result <s_out3>
5641           use <s_out0>
5642           use <s_out0>
5643
5644      into:
5645
5646         loop_exit:
5647           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5648           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5649           v_out2 = reduce <v_out1>
5650           s_out3 = extract_field <v_out2, 0>
5651           s_out4 = adjust_result <s_out3>
5652           use <s_out4>
5653           use <s_out4> */
5654
5655
5656   /* In SLP reduction chain we reduce vector results into one vector if
5657      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5658      LHS of the last stmt in the reduction chain, since we are looking for
5659      the loop exit phi node.  */
5660   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5661     {
5662       stmt_vec_info dest_stmt_info
5663         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5664       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5665       group_size = 1;
5666     }
5667
5668   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5669      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5670      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5671      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5672      correspond to the first vector stmt, etc.
5673      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5674   if (group_size > new_phis.length ())
5675     {
5676       ratio = group_size / new_phis.length ();
5677       gcc_assert (!(group_size % new_phis.length ()));
5678     }
5679   else
5680     ratio = 1;
5681
5682   stmt_vec_info epilog_stmt_info = NULL;
5683   for (k = 0; k < group_size; k++)
5684     {
5685       if (k % ratio == 0)
5686         {
5687           epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5688           reduction_phi_info = reduction_phis[k / ratio];
5689           if (double_reduc)
5690             inner_phi = inner_phis[k / ratio];
5691         }
5692
5693       if (slp_reduc)
5694         {
5695           stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5696
5697           orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5698           /* SLP statements can't participate in patterns.  */
5699           gcc_assert (!orig_stmt_info);
5700           scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5701         }
5702
5703       phis.create (3);
5704       /* Find the loop-closed-use at the loop exit of the original scalar
5705          result.  (The reduction result is expected to have two immediate uses -
5706          one at the latch block, and one at the loop exit).  */
5707       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5708         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5709             && !is_gimple_debug (USE_STMT (use_p)))
5710           phis.safe_push (USE_STMT (use_p));
5711
5712       /* While we expect to have found an exit_phi because of loop-closed-ssa
5713          form we can end up without one if the scalar cycle is dead.  */
5714
5715       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5716         {
5717           if (outer_loop)
5718             {
5719               stmt_vec_info exit_phi_vinfo
5720                 = loop_vinfo->lookup_stmt (exit_phi);
5721               gphi *vect_phi;
5722
5723               if (double_reduc)
5724                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5725               else
5726                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5727               if (!double_reduc
5728                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5729                       != vect_double_reduction_def)
5730                 continue;
5731
5732               /* Handle double reduction:
5733
5734                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5735                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5736                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5737                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5738
5739                  At that point the regular reduction (stmt2 and stmt3) is
5740                  already vectorized, as well as the exit phi node, stmt4.
5741                  Here we vectorize the phi node of double reduction, stmt1, and
5742                  update all relevant statements.  */
5743
5744               /* Go through all the uses of s2 to find double reduction phi
5745                  node, i.e., stmt1 above.  */
5746               orig_name = PHI_RESULT (exit_phi);
5747               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5748                 {
5749                   stmt_vec_info use_stmt_vinfo;
5750                   tree vect_phi_init, preheader_arg, vect_phi_res;
5751                   basic_block bb = gimple_bb (use_stmt);
5752
5753                   /* Check that USE_STMT is really double reduction phi
5754                      node.  */
5755                   if (gimple_code (use_stmt) != GIMPLE_PHI
5756                       || gimple_phi_num_args (use_stmt) != 2
5757                       || bb->loop_father != outer_loop)
5758                     continue;
5759                   use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5760                   if (!use_stmt_vinfo
5761                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5762                           != vect_double_reduction_def)
5763                     continue;
5764
5765                   /* Create vector phi node for double reduction:
5766                      vs1 = phi <vs0, vs2>
5767                      vs1 was created previously in this function by a call to
5768                        vect_get_vec_def_for_operand and is stored in
5769                        vec_initial_def;
5770                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5771                      vs0 is created here.  */
5772
5773                   /* Create vector phi node.  */
5774                   vect_phi = create_phi_node (vec_initial_def, bb);
5775                   loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5776
5777                   /* Create vs0 - initial def of the double reduction phi.  */
5778                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5779                                              loop_preheader_edge (outer_loop));
5780                   vect_phi_init = get_initial_def_for_reduction
5781                     (stmt_info, preheader_arg, NULL);
5782
5783                   /* Update phi node arguments with vs0 and vs2.  */
5784                   add_phi_arg (vect_phi, vect_phi_init,
5785                                loop_preheader_edge (outer_loop),
5786                                UNKNOWN_LOCATION);
5787                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5788                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5789                   if (dump_enabled_p ())
5790                     dump_printf_loc (MSG_NOTE, vect_location,
5791                                      "created double reduction phi node: %G",
5792                                      vect_phi);
5793
5794                   vect_phi_res = PHI_RESULT (vect_phi);
5795
5796                   /* Replace the use, i.e., set the correct vs1 in the regular
5797                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5798                      loop is redundant.  */
5799                   stmt_vec_info use_info = reduction_phi_info;
5800                   for (j = 0; j < ncopies; j++)
5801                     {
5802                       edge pr_edge = loop_preheader_edge (loop);
5803                       SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5804                                        pr_edge->dest_idx, vect_phi_res);
5805                       use_info = STMT_VINFO_RELATED_STMT (use_info);
5806                     }
5807                 }
5808             }
5809         }
5810
5811       phis.release ();
5812       if (nested_in_vect_loop)
5813         {
5814           if (double_reduc)
5815             loop = outer_loop;
5816           else
5817             continue;
5818         }
5819
5820       phis.create (3);
5821       /* Find the loop-closed-use at the loop exit of the original scalar
5822          result.  (The reduction result is expected to have two immediate uses,
5823          one at the latch block, and one at the loop exit).  For double
5824          reductions we are looking for exit phis of the outer loop.  */
5825       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5826         {
5827           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5828             {
5829               if (!is_gimple_debug (USE_STMT (use_p)))
5830                 phis.safe_push (USE_STMT (use_p));
5831             }
5832           else
5833             {
5834               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5835                 {
5836                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5837
5838                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5839                     {
5840                       if (!flow_bb_inside_loop_p (loop,
5841                                              gimple_bb (USE_STMT (phi_use_p)))
5842                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5843                         phis.safe_push (USE_STMT (phi_use_p));
5844                     }
5845                 }
5846             }
5847         }
5848
5849       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5850         {
5851           /* Replace the uses:  */
5852           orig_name = PHI_RESULT (exit_phi);
5853           scalar_result = scalar_results[k];
5854           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5855             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5856               SET_USE (use_p, scalar_result);
5857         }
5858
5859       phis.release ();
5860     }
5861 }
5862
5863 /* Return a vector of type VECTYPE that is equal to the vector select
5864    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5865    before GSI.  */
5866
5867 static tree
5868 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5869                      tree vec, tree identity)
5870 {
5871   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5872   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5873                                           mask, vec, identity);
5874   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5875   return cond;
5876 }
5877
5878 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5879    order, starting with LHS.  Insert the extraction statements before GSI and
5880    associate the new scalar SSA names with variable SCALAR_DEST.
5881    Return the SSA name for the result.  */
5882
5883 static tree
5884 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5885                        tree_code code, tree lhs, tree vector_rhs)
5886 {
5887   tree vectype = TREE_TYPE (vector_rhs);
5888   tree scalar_type = TREE_TYPE (vectype);
5889   tree bitsize = TYPE_SIZE (scalar_type);
5890   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5891   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5892
5893   for (unsigned HOST_WIDE_INT bit_offset = 0;
5894        bit_offset < vec_size_in_bits;
5895        bit_offset += element_bitsize)
5896     {
5897       tree bitpos = bitsize_int (bit_offset);
5898       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5899                          bitsize, bitpos);
5900
5901       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5902       rhs = make_ssa_name (scalar_dest, stmt);
5903       gimple_assign_set_lhs (stmt, rhs);
5904       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5905
5906       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5907       tree new_name = make_ssa_name (scalar_dest, stmt);
5908       gimple_assign_set_lhs (stmt, new_name);
5909       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5910       lhs = new_name;
5911     }
5912   return lhs;
5913 }
5914
5915 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
5916    type of the vector input.  */
5917
5918 static internal_fn
5919 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5920 {
5921   internal_fn mask_reduc_fn;
5922
5923   switch (reduc_fn)
5924     {
5925     case IFN_FOLD_LEFT_PLUS:
5926       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5927       break;
5928
5929     default:
5930       return IFN_LAST;
5931     }
5932
5933   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5934                                       OPTIMIZE_FOR_SPEED))
5935     return mask_reduc_fn;
5936   return IFN_LAST;
5937 }
5938
5939 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5940    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5941    statement.  CODE is the operation performed by STMT_INFO and OPS are
5942    its scalar operands.  REDUC_INDEX is the index of the operand in
5943    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5944    implements in-order reduction, or IFN_LAST if we should open-code it.
5945    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5946    that should be used to control the operation in a fully-masked loop.

        One reduction statement is built for each vector definition of the
        non-reduction operand, each taking the result of the previous one as
        its scalar input.  The last one is given the original scalar
        destination and handed to vect_finish_replace_stmt; the others go
        through vect_finish_stmt_generation at GSI.  With SLP_NODE the
        resulting statement infos are pushed onto its SLP_TREE_VEC_STMTS;
        otherwise the final one is stored in *VEC_STMT and in
        STMT_VINFO_VEC_STMT (STMT_INFO).  Always returns true.  */
5947
5948 static bool
5949 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5950                                gimple_stmt_iterator *gsi,
5951                                stmt_vec_info *vec_stmt, slp_tree slp_node,
5952                                gimple *reduc_def_stmt,
5953                                tree_code code, internal_fn reduc_fn,
5954                                tree ops[3], tree vectype_in,
5955                                int reduc_index, vec_loop_masks *masks)
5956 {
5957   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5958   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5959   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5960   stmt_vec_info new_stmt_info = NULL;
       /* Masked counterpart of REDUC_FN, or IFN_LAST if none can be used.  */
5961   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5962
5963   int ncopies;
5964   if (slp_node)
5965     ncopies = 1;
5966   else
5967     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5968
       /* Preconditions: a single copy of a binary operation, not nested in
          the vectorized loop, whose reduction value is the second operand
          (the first for MINUS_EXPR).  */
5969   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5970   gcc_assert (ncopies == 1);
5971   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5972   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5973   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5974               == FOLD_LEFT_REDUCTION);
5975
5976   if (slp_node)
5977     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5978                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5979
       /* The scalar operand that is not the reduction value.  */
5980   tree op0 = ops[1 - reduc_index];
5981
5982   int group_size = 1;
5983   stmt_vec_info scalar_dest_def_info;
5984   auto_vec<tree> vec_oprnds0;
5985   if (slp_node)
5986     {
           /* Collect the vector definitions of both scalar operands, but
              keep only those of the non-reduction operand.  */
5987       auto_vec<vec<tree> > vec_defs (2);
5988       auto_vec<tree> sops(2);
5989       sops.quick_push (ops[0]);
5990       sops.quick_push (ops[1]);
5991       vect_get_slp_defs (sops, slp_node, &vec_defs);
5992       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5993       vec_defs[0].release ();
5994       vec_defs[1].release ();
           /* The destination is that of the last scalar statement in the
              SLP node.  */
5995       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5996       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5997     }
5998   else
5999     {
           /* Non-SLP: a single vector definition of OP0, and STMT_INFO
              itself provides the destination.  */
6000       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
6001       vec_oprnds0.create (1);
6002       vec_oprnds0.quick_push (loop_vec_def0);
6003       scalar_dest_def_info = stmt_info;
6004     }
6005
6006   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6007   tree scalar_type = TREE_TYPE (scalar_dest);
       /* The running scalar value; starts as the result of the phi.  */
6008   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6009
       /* More than one vector operand is only expected with SLP.  */
6010   int vec_num = vec_oprnds0.length ();
6011   gcc_assert (vec_num == 1 || slp_node);
6012   tree vec_elem_type = TREE_TYPE (vectype_out);
6013   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6014
       /* Zero vector passed to merge_with_identity below when the loop is
          fully masked but no masked reduction function is available.
          NOTE(review): the identity is always +0; if VECTYPE_OUT is a
          floating-point type that honors signed zeros, adding +0 lanes could
          turn a -0 result into +0 -- confirm this case is excluded
          elsewhere.  */
6015   tree vector_identity = NULL_TREE;
6016   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6017     vector_identity = build_zero_cst (vectype_out);
6018
6019   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6020   int i;
6021   tree def0;
6022   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6023     {
6024       gimple *new_stmt;
           /* Loop mask for this vector; stays NULL_TREE unless the loop is
              fully masked.  */
6025       tree mask = NULL_TREE;
6026       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6027         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6028
6029       /* Handle MINUS by adding the negative.  */
6030       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6031         {
6032           tree negated = make_ssa_name (vectype_out);
6033           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6034           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6035           def0 = negated;
6036         }
6037
           /* Without a masked reduction function the mask is applied to the
              vector input instead, using VECTOR_IDENTITY.  */
6038       if (mask && mask_reduc_fn == IFN_LAST)
6039         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6040                                     vector_identity);
6041
6042       /* On the first iteration the input is simply the scalar phi
6043          result, and for subsequent iterations it is the output of
6044          the preceding operation.  */
6045       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6046         {
               /* Prefer the masked internal function when there is a mask
                  and the function is available; otherwise call REDUC_FN.  */
6047           if (mask && mask_reduc_fn != IFN_LAST)
6048             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6049                                                    def0, mask);
6050           else
6051             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6052                                                    def0);
6053           /* For chained SLP reductions the output of the previous reduction
6054              operation serves as the input of the next. For the final statement
6055              the output cannot be a temporary - we reuse the original
6056              scalar destination of the last statement.  */
6057           if (i != vec_num - 1)
6058             {
6059               gimple_set_lhs (new_stmt, scalar_dest_var);
6060               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6061               gimple_set_lhs (new_stmt, reduc_var);
6062             }
6063         }
6064       else
6065         {
               /* Open-coded path: vect_expand_fold_left emits the sequence
                  and returns the SSA name holding its result.  */
6066           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6067                                              reduc_var, def0);
6068           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6069           /* Remove the statement, so that we can use the same code paths
6070              as for statements that we've just created.  */
6071           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6072           gsi_remove (&tmp_gsi, true);
6073         }
6074
           /* The final statement defines the original scalar destination and
              goes through vect_finish_replace_stmt; intermediate ones are
              emitted at GSI.  */
6075       if (i == vec_num - 1)
6076         {
6077           gimple_set_lhs (new_stmt, scalar_dest);
6078           new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6079                                                     new_stmt);
6080         }
6081       else
6082         new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
6083                                                      new_stmt, gsi);
6084
6085       if (slp_node)
6086         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6087     }
6088
       /* Non-SLP: record the last generated statement as the vectorized
          form of STMT_INFO.  */
6089   if (!slp_node)
6090     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6091
6092   return true;
6093 }
6094
6095 /* Function is_nonwrapping_integer_induction.
6096
6097    Return true if the induction described by STMT_VINFO (a PHI in loop
6098    LOOP) has an INTEGER_CST base and step and cannot wrap: either overflow
6099    is undefined for the type of the PHI result, or BASE + STEP * N still
6100    fits that type, where N is the bound on statement executions obtained
6101    from max_stmt_executions.  Return false if this cannot be shown.  */
6102
6103 static bool
6104 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
6105 {
6106   gphi *iv_phi = as_a <gphi *> (stmt_vinfo->stmt);
6107   tree iv_base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6108   tree iv_step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6109   tree iv_type = TREE_TYPE (gimple_phi_result (iv_phi));
6110
6111   /* Only compile-time constant bases and steps can be reasoned about.  */
6112   if (!(TREE_CODE (iv_base) == INTEGER_CST
6113         && TREE_CODE (iv_step) == INTEGER_CST))
6114     return false;
6115
6116   /* With undefined overflow there is nothing more to check.  */
6117   if (TYPE_OVERFLOW_UNDEFINED (iv_type))
6118     return true;
6119
6120   widest_int max_execs;
6121   if (!max_stmt_executions (loop, &max_execs))
6122     return false;
6123
6124   /* Compute BASE + STEP * MAX_EXECS, giving up on overflow.  */
6125   const signop sgn = TYPE_SIGN (iv_type);
6126   wi::overflow_type ovf = wi::OVF_NONE;
6127   widest_int reach = wi::mul (wi::to_widest (iv_step), max_execs, sgn, &ovf);
6128   if (ovf)
6129     return false;
6130   reach = wi::add (wi::to_widest (iv_base), reach, sgn, &ovf);
6131   if (ovf)
6132     return false;
6133
6134   return wi::min_precision (reach, sgn) <= TYPE_PRECISION (iv_type);
6135 }
6136
6137 /* Return true if a masked form of operation CODE should be obtained by
6138    inserting a conditional expression on one of its operands.  COND_FN is
6139    the conditional internal function for CODE, or IFN_LAST if there is
6140    none; VECTYPE_IN is the type of the vector input.  */
6141 static bool
6142 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6143                          tree vectype_in)
6144 {
6145   /* Only these codes have a conditional-expression fallback.  */
6146   bool has_fallback = (code == DOT_PROD_EXPR || code == SAD_EXPR);
6147
6148   /* The fallback is not used when the target supports the conditional
6149      internal function directly.  */
6150   bool have_cond_fn = false;
6151   if (cond_fn != IFN_LAST)
6152     have_cond_fn = direct_internal_fn_supported_p (cond_fn, vectype_in,
6153                                                    OPTIMIZE_FOR_SPEED);
6154   if (have_cond_fn)
6155     return false;
6156
6157   return has_fallback;
6158 }
6159
6160 /* Insert a conditional expression to enable masked vectorization of the
6161    operation CODE.  VOP is its array of operands and MASK the loop mask.
6162    A VEC_COND_EXPR that yields VOP[1] where MASK is set and a replacement
6163    value elsewhere is inserted before GSI, and VOP[1] is replaced by its
6164    result.  Only DOT_PROD_EXPR and SAD_EXPR are handled.  */
6165 static void
6166 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6167                       gimple_stmt_iterator *gsi)
6168 {
6169   /* Value that VOP[1] takes in lanes where MASK is false.  */
6170   tree inactive_val;
6171   switch (code)
6172     {
6173     case DOT_PROD_EXPR:
6174       /* Inactive lanes of operand 1 become zero.  */
6175       inactive_val = build_zero_cst (TREE_TYPE (vop[1]));
6176       break;
6177
6178     case SAD_EXPR:
6179       /* Inactive lanes of operand 1 become a copy of operand 0.  */
6180       inactive_val = vop[0];
6181       break;
6182
6183     default:
6184       gcc_unreachable ();
6185     }
6186
6187   tree op1_type = TREE_TYPE (vop[1]);
6188   tree guarded_op1 = make_temp_ssa_name (op1_type, NULL, "masked_op1");
6189   gassign *vcond_stmt
6190     = gimple_build_assign (guarded_op1, VEC_COND_EXPR, mask, vop[1],
6191                            inactive_val);
6192   gsi_insert_before (gsi, vcond_stmt, GSI_SAME_STMT);
6193
6194   /* Hand the masked operand back to the caller through VOP.  */
6195   vop[1] = guarded_op1;
6196 }
6197
6198 /* Function vectorizable_reduction.
6199
6200    Check if STMT_INFO performs a reduction operation that can be vectorized.
6201    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6202    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6203    Return true if STMT_INFO is vectorizable in this way.
6204
6205    This function also handles reduction idioms (patterns) that have been
6206    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6207    may be of this form:
6208      X = pattern_expr (arg0, arg1, ..., X)
6209    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6210    sequence that had been detected and replaced by the pattern-stmt
6211    (STMT_INFO).
6212
6213    This function also handles reduction of condition expressions, for example:
6214      for (int i = 0; i < N; i++)
6215        if (a[i] < value)
6216          last = a[i];
6217    This is handled by vectorising the loop and creating an additional vector
6218    containing the loop indexes for which "a[i] < value" was true.  In the
6219    function epilogue this is reduced to a single max value and then used to
6220    index into the vector of results.
6221
6222    In some cases of reduction patterns, the type of the reduction variable X is
6223    different than the type of the other arguments of STMT_INFO.
6224    In such cases, the vectype that is used when transforming STMT_INFO into
6225    a vector stmt is different than the vectype that is used to determine the
6226    vectorization factor, because it consists of a different number of elements
6227    than the actual number of elements that are being operated upon in parallel.
6228
6229    For example, consider an accumulation of shorts into an int accumulator.
6230    On some targets it's possible to vectorize this pattern operating on 8
6231    shorts at a time (hence, the vectype for purposes of determining the
6232    vectorization factor should be V8HI); on the other hand, the vectype that
6233    is used to create the vector form is actually V4SI (the type of the result).
6234
6235    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6236    indicates what is the actual level of parallelism (V8HI in the example), so
6237    that the right vectorization factor would be derived.  This vectype
6238    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6239    be used to create the vectorized stmt.  The right vectype for the vectorized
6240    stmt is obtained from the type of the result X:
6241         get_vectype_for_scalar_type (TREE_TYPE (X))
6242
6243    This means that, contrary to "regular" reductions (or "regular" stmts in
6244    general), the following equation:
6245       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6246    does *NOT* necessarily hold for reduction patterns.  */
6247
6248 bool
6249 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6250                         stmt_vec_info *vec_stmt, slp_tree slp_node,
6251                         slp_instance slp_node_instance,
6252                         stmt_vector_for_cost *cost_vec)
6253 {
6254   tree vec_dest;
6255   tree scalar_dest;
6256   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6257   tree vectype_in = NULL_TREE;
6258   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6259   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6260   enum tree_code code, orig_code;
6261   internal_fn reduc_fn;
6262   machine_mode vec_mode;
6263   int op_type;
6264   optab optab;
6265   tree new_temp = NULL_TREE;
6266   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6267   stmt_vec_info cond_stmt_vinfo = NULL;
6268   enum tree_code cond_reduc_op_code = ERROR_MARK;
6269   tree scalar_type;
6270   bool is_simple_use;
6271   int i;
6272   int ncopies;
6273   int epilog_copies;
6274   stmt_vec_info prev_stmt_info, prev_phi_info;
6275   bool single_defuse_cycle = false;
6276   stmt_vec_info new_stmt_info = NULL;
6277   int j;
6278   tree ops[3];
6279   enum vect_def_type dts[3];
6280   bool nested_cycle = false, found_nested_cycle_def = false;
6281   bool double_reduc = false;
6282   basic_block def_bb;
6283   struct loop * def_stmt_loop;
6284   tree def_arg;
6285   auto_vec<tree> vec_oprnds0;
6286   auto_vec<tree> vec_oprnds1;
6287   auto_vec<tree> vec_oprnds2;
6288   auto_vec<tree> vect_defs;
6289   auto_vec<stmt_vec_info> phis;
6290   int vec_num;
6291   tree def0, tem;
6292   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6293   tree cond_reduc_val = NULL_TREE;
6294
6295   /* Make sure it was already recognized as a reduction computation.  */
6296   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6297       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6298     return false;
6299
6300   if (nested_in_vect_loop_p (loop, stmt_info))
6301     {
6302       loop = loop->inner;
6303       nested_cycle = true;
6304     }
6305
6306   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6307     gcc_assert (slp_node
6308                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6309
6310   if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6311     {
6312       tree phi_result = gimple_phi_result (phi);
6313       /* Analysis is fully done on the reduction stmt invocation.  */
6314       if (! vec_stmt)
6315         {
6316           if (slp_node)
6317             slp_node_instance->reduc_phis = slp_node;
6318
6319           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6320           return true;
6321         }
6322
6323       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6324         /* Leave the scalar phi in place.  Note that checking
6325            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6326            for reductions involving a single statement.  */
6327         return true;
6328
6329       stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6330       reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6331
6332       if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6333           == EXTRACT_LAST_REDUCTION)
6334         /* Leave the scalar phi in place.  */
6335         return true;
6336
6337       gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6338       code = gimple_assign_rhs_code (reduc_stmt);
6339       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6340         {
6341           tree op = gimple_op (reduc_stmt, k);
6342           if (op == phi_result)
6343             continue;
6344           if (k == 1 && code == COND_EXPR)
6345             continue;
6346           bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6347           gcc_assert (is_simple_use);
6348           if (dt == vect_constant_def || dt == vect_external_def)
6349             continue;
6350           if (!vectype_in
6351               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6352                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6353             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6354           break;
6355         }
6356       /* For a nested cycle we might end up with an operation like
6357          phi_result * phi_result.  */
6358       if (!vectype_in)
6359         vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6360       gcc_assert (vectype_in);
6361
6362       if (slp_node)
6363         ncopies = 1;
6364       else
6365         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6366
6367       stmt_vec_info use_stmt_info;
6368       if (ncopies > 1
6369           && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6370           && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6371           && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6372         single_defuse_cycle = true;
6373
6374       /* Create the destination vector  */
6375       scalar_dest = gimple_assign_lhs (reduc_stmt);
6376       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6377
6378       if (slp_node)
6379         /* The size vect_schedule_slp_instance computes is off for us.  */
6380         vec_num = vect_get_num_vectors
6381           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6382            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6383            vectype_in);
6384       else
6385         vec_num = 1;
6386
6387       /* Generate the reduction PHIs upfront.  */
6388       prev_phi_info = NULL;
6389       for (j = 0; j < ncopies; j++)
6390         {
6391           if (j == 0 || !single_defuse_cycle)
6392             {
6393               for (i = 0; i < vec_num; i++)
6394                 {
6395                   /* Create the reduction-phi that defines the reduction
6396                      operand.  */
6397                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6398                   stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6399
6400                   if (slp_node)
6401                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6402                   else
6403                     {
6404                       if (j == 0)
6405                         STMT_VINFO_VEC_STMT (stmt_info)
6406                           = *vec_stmt = new_phi_info;
6407                       else
6408                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6409                       prev_phi_info = new_phi_info;
6410                     }
6411                 }
6412             }
6413         }
6414
6415       return true;
6416     }
6417
6418   /* 1. Is vectorizable reduction?  */
6419   /* Not supportable if the reduction variable is used in the loop, unless
6420      it's a reduction chain.  */
6421   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6422       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6423     return false;
6424
6425   /* Reductions that are not used even in an enclosing outer-loop,
6426      are expected to be "live" (used out of the loop).  */
6427   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6428       && !STMT_VINFO_LIVE_P (stmt_info))
6429     return false;
6430
6431   /* 2. Has this been recognized as a reduction pattern?
6432
6433      Check if STMT represents a pattern that has been recognized
6434      in earlier analysis stages.  For stmts that represent a pattern,
6435      the STMT_VINFO_RELATED_STMT field records the last stmt in
6436      the original sequence that constitutes the pattern.  */
6437
6438   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6439   if (orig_stmt_info)
6440     {
6441       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6442       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6443     }
6444
6445   /* 3. Check the operands of the operation.  The first operands are defined
6446         inside the loop body. The last operand is the reduction variable,
6447         which is defined by the loop-header-phi.  */
6448
6449   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6450
6451   /* Flatten RHS.  */
6452   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6453     {
6454     case GIMPLE_BINARY_RHS:
6455       code = gimple_assign_rhs_code (stmt);
6456       op_type = TREE_CODE_LENGTH (code);
6457       gcc_assert (op_type == binary_op);
6458       ops[0] = gimple_assign_rhs1 (stmt);
6459       ops[1] = gimple_assign_rhs2 (stmt);
6460       break;
6461
6462     case GIMPLE_TERNARY_RHS:
6463       code = gimple_assign_rhs_code (stmt);
6464       op_type = TREE_CODE_LENGTH (code);
6465       gcc_assert (op_type == ternary_op);
6466       ops[0] = gimple_assign_rhs1 (stmt);
6467       ops[1] = gimple_assign_rhs2 (stmt);
6468       ops[2] = gimple_assign_rhs3 (stmt);
6469       break;
6470
6471     case GIMPLE_UNARY_RHS:
6472       return false;
6473
6474     default:
6475       gcc_unreachable ();
6476     }
6477
6478   if (code == COND_EXPR && slp_node)
6479     return false;
6480
6481   scalar_dest = gimple_assign_lhs (stmt);
6482   scalar_type = TREE_TYPE (scalar_dest);
6483   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6484       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6485     return false;
6486
6487   /* Do not try to vectorize bit-precision reductions.  */
6488   if (!type_has_mode_precision_p (scalar_type))
6489     return false;
6490
6491   /* All uses but the last are expected to be defined in the loop.
6492      The last use is the reduction variable.  In case of nested cycle this
6493      assumption is not true: we use reduc_index to record the index of the
6494      reduction variable.  */
6495   stmt_vec_info reduc_def_info;
6496   if (orig_stmt_info)
6497     reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6498   else
6499     reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6500   gcc_assert (reduc_def_info);
6501   gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6502   tree reduc_def = PHI_RESULT (reduc_def_phi);
6503   int reduc_index = -1;
6504   for (i = 0; i < op_type; i++)
6505     {
6506       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6507       if (i == 0 && code == COND_EXPR)
6508         continue;
6509
6510       stmt_vec_info def_stmt_info;
6511       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6512                                           &def_stmt_info);
6513       dt = dts[i];
6514       gcc_assert (is_simple_use);
6515       if (dt == vect_reduction_def
6516           && ops[i] == reduc_def)
6517         {
6518           reduc_index = i;
6519           continue;
6520         }
6521       else if (tem)
6522         {
6523           /* To properly compute ncopies we are interested in the widest
6524              input type in case we're looking at a widening accumulation.  */
6525           if (!vectype_in
6526               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6527                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6528             vectype_in = tem;
6529         }
6530
6531       if (dt != vect_internal_def
6532           && dt != vect_external_def
6533           && dt != vect_constant_def
6534           && dt != vect_induction_def
6535           && !(dt == vect_nested_cycle && nested_cycle))
6536         return false;
6537
6538       if (dt == vect_nested_cycle
6539           && ops[i] == reduc_def)
6540         {
6541           found_nested_cycle_def = true;
6542           reduc_index = i;
6543         }
6544
6545       if (i == 1 && code == COND_EXPR)
6546         {
6547           /* Record how value of COND_EXPR is defined.  */
6548           if (dt == vect_constant_def)
6549             {
6550               cond_reduc_dt = dt;
6551               cond_reduc_val = ops[i];
6552             }
6553           if (dt == vect_induction_def
6554               && def_stmt_info
6555               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6556             {
6557               cond_reduc_dt = dt;
6558               cond_stmt_vinfo = def_stmt_info;
6559             }
6560         }
6561     }
6562
6563   if (!vectype_in)
6564     vectype_in = vectype_out;
6565
6566   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6567      directly used in stmt.  */
6568   if (reduc_index == -1)
6569     {
6570       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6571         {
6572           if (dump_enabled_p ())
6573             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6574                              "in-order reduction chain without SLP.\n");
6575           return false;
6576         }
6577     }
6578
6579   if (!(reduc_index == -1
6580         || dts[reduc_index] == vect_reduction_def
6581         || dts[reduc_index] == vect_nested_cycle
6582         || ((dts[reduc_index] == vect_internal_def
6583              || dts[reduc_index] == vect_external_def
6584              || dts[reduc_index] == vect_constant_def
6585              || dts[reduc_index] == vect_induction_def)
6586             && nested_cycle && found_nested_cycle_def)))
6587     {
6588       /* For pattern recognized stmts, orig_stmt might be a reduction,
6589          but some helper statements for the pattern might not, or
6590          might be COND_EXPRs with reduction uses in the condition.  */
6591       gcc_assert (orig_stmt_info);
6592       return false;
6593     }
6594
6595   /* PHIs should not participate in patterns.  */
6596   gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6597   enum vect_reduction_type v_reduc_type
6598     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6599   stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6600
6601   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6602   /* If we have a condition reduction, see if we can simplify it further.  */
6603   if (v_reduc_type == COND_REDUCTION)
6604     {
6605       /* TODO: We can't yet handle reduction chains, since we need to treat
6606          each COND_EXPR in the chain specially, not just the last one.
6607          E.g. for:
6608
6609             x_1 = PHI <x_3, ...>
6610             x_2 = a_2 ? ... : x_1;
6611             x_3 = a_3 ? ... : x_2;
6612
6613          we're interested in the last element in x_3 for which a_2 || a_3
6614          is true, whereas the current reduction chain handling would
6615          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6616          as a reduction operation.  */
6617       if (reduc_index == -1)
6618         {
6619           if (dump_enabled_p ())
6620             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6621                              "conditional reduction chains not supported\n");
6622           return false;
6623         }
6624
6625       /* vect_is_simple_reduction ensured that operand 2 is the
6626          loop-carried operand.  */
6627       gcc_assert (reduc_index == 2);
6628
6629       /* Loop peeling modifies initial value of reduction PHI, which
6630          makes the reduction stmt to be transformed different to the
6631          original stmt analyzed.  We need to record reduction code for
6632          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6633          it can be used directly at transform stage.  */
6634       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6635           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6636         {
6637           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6638           gcc_assert (cond_reduc_dt == vect_constant_def);
6639           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6640         }
6641       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6642                                                vectype_in, OPTIMIZE_FOR_SPEED))
6643         {
6644           if (dump_enabled_p ())
6645             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6646                              "optimizing condition reduction with"
6647                              " FOLD_EXTRACT_LAST.\n");
6648           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6649         }
6650       else if (cond_reduc_dt == vect_induction_def)
6651         {
6652           tree base
6653             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6654           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6655
6656           gcc_assert (TREE_CODE (base) == INTEGER_CST
6657                       && TREE_CODE (step) == INTEGER_CST);
6658           cond_reduc_val = NULL_TREE;
6659           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6660              above base; punt if base is the minimum value of the type for
6661              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6662           if (tree_int_cst_sgn (step) == -1)
6663             {
6664               cond_reduc_op_code = MIN_EXPR;
6665               if (tree_int_cst_sgn (base) == -1)
6666                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6667               else if (tree_int_cst_lt (base,
6668                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6669                 cond_reduc_val
6670                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6671             }
6672           else
6673             {
6674               cond_reduc_op_code = MAX_EXPR;
6675               if (tree_int_cst_sgn (base) == 1)
6676                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6677               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6678                                         base))
6679                 cond_reduc_val
6680                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6681             }
6682           if (cond_reduc_val)
6683             {
6684               if (dump_enabled_p ())
6685                 dump_printf_loc (MSG_NOTE, vect_location,
6686                                  "condition expression based on "
6687                                  "integer induction.\n");
6688               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6689                 = INTEGER_INDUC_COND_REDUCTION;
6690             }
6691         }
6692       else if (cond_reduc_dt == vect_constant_def)
6693         {
6694           enum vect_def_type cond_initial_dt;
6695           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6696           tree cond_initial_val
6697             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6698
6699           gcc_assert (cond_reduc_val != NULL_TREE);
6700           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6701           if (cond_initial_dt == vect_constant_def
6702               && types_compatible_p (TREE_TYPE (cond_initial_val),
6703                                      TREE_TYPE (cond_reduc_val)))
6704             {
6705               tree e = fold_binary (LE_EXPR, boolean_type_node,
6706                                     cond_initial_val, cond_reduc_val);
6707               if (e && (integer_onep (e) || integer_zerop (e)))
6708                 {
6709                   if (dump_enabled_p ())
6710                     dump_printf_loc (MSG_NOTE, vect_location,
6711                                      "condition expression based on "
6712                                      "compile time constant.\n");
6713                   /* Record reduction code at analysis stage.  */
6714                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6715                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6716                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6717                     = CONST_COND_REDUCTION;
6718                 }
6719             }
6720         }
6721     }
6722
6723   if (orig_stmt_info)
6724     gcc_assert (tmp == orig_stmt_info
6725                 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6726   else
6727     /* We changed STMT to be the first stmt in reduction chain, hence we
6728        check that in this case the first element in the chain is STMT.  */
6729     gcc_assert (tmp == stmt_info
6730                 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6731
6732   if (STMT_VINFO_LIVE_P (reduc_def_info))
6733     return false;
6734
6735   if (slp_node)
6736     ncopies = 1;
6737   else
6738     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6739
6740   gcc_assert (ncopies >= 1);
6741
6742   vec_mode = TYPE_MODE (vectype_in);
6743   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6744
6745   if (nested_cycle)
6746     {
6747       def_bb = gimple_bb (reduc_def_phi);
6748       def_stmt_loop = def_bb->loop_father;
6749       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6750                                        loop_preheader_edge (def_stmt_loop));
6751       stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6752       if (def_arg_stmt_info
6753           && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6754               == vect_double_reduction_def))
6755         double_reduc = true;
6756     }
6757
6758   vect_reduction_type reduction_type
6759     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6760   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6761       && ncopies > 1)
6762     {
6763       if (dump_enabled_p ())
6764         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765                          "multiple types in double reduction or condition "
6766                          "reduction.\n");
6767       return false;
6768     }
6769
6770   if (code == COND_EXPR)
6771     {
6772       /* Only call during the analysis stage, otherwise we'll lose
6773          STMT_VINFO_TYPE.  */
6774       if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6775                                                 true, NULL, cost_vec))
6776         {
6777           if (dump_enabled_p ())
6778             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779                              "unsupported condition in reduction\n");
6780           return false;
6781         }
6782     }
6783   else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6784            || code == LROTATE_EXPR || code == RROTATE_EXPR)
6785     {
6786       /* Only call during the analysis stage, otherwise we'll lose
6787          STMT_VINFO_TYPE.  We only support this for nested cycles
6788          without double reductions at the moment.  */
6789       if (!nested_cycle
6790           || double_reduc
6791           || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6792                                                 NULL, cost_vec)))
6793         {
6794           if (dump_enabled_p ())
6795             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6796                              "unsupported shift or rotation in reduction\n");
6797           return false;
6798         }
6799     }
6800   else
6801     {
6802       /* 4. Supportable by target?  */
6803
6804       /* 4.1. check support for the operation in the loop  */
6805       optab = optab_for_tree_code (code, vectype_in, optab_default);
6806       if (!optab)
6807         {
6808           if (dump_enabled_p ())
6809             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810                              "no optab.\n");
6811
6812           return false;
6813         }
6814
6815       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6816         {
6817           if (dump_enabled_p ())
6818             dump_printf (MSG_NOTE, "op not supported by target.\n");
6819
6820           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6821               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6822             return false;
6823
6824           if (dump_enabled_p ())
6825             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6826         }
6827
6828       /* Worthwhile without SIMD support?  */
6829       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6830           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6831         {
6832           if (dump_enabled_p ())
6833             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6834                              "not worthwhile without SIMD support.\n");
6835
6836           return false;
6837         }
6838     }
6839
6840   /* 4.2. Check support for the epilog operation.
6841
6842           If STMT represents a reduction pattern, then the type of the
6843           reduction variable may be different than the type of the rest
6844           of the arguments.  For example, consider the case of accumulation
6845           of shorts into an int accumulator; The original code:
6846                         S1: int_a = (int) short_a;
6847           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6848
6849           was replaced with:
6850                         STMT: int_acc = widen_sum <short_a, int_acc>
6851
6852           This means that:
6853           1. The tree-code that is used to create the vector operation in the
6854              epilog code (that reduces the partial results) is not the
6855              tree-code of STMT, but is rather the tree-code of the original
6856              stmt from the pattern that STMT is replacing.  I.e, in the example
6857              above we want to use 'widen_sum' in the loop, but 'plus' in the
6858              epilog.
6859           2. The type (mode) we use to check available target support
6860              for the vector operation to be created in the *epilog*, is
6861              determined by the type of the reduction variable (in the example
6862              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6863              However the type (mode) we use to check available target support
6864              for the vector operation to be created *inside the loop*, is
6865              determined by the type of the other arguments to STMT (in the
6866              example we'd check this: optab_handler (widen_sum_optab,
6867              vect_short_mode)).
6868
6869           This is contrary to "regular" reductions, in which the types of all
6870           the arguments are the same as the type of the reduction variable.
6871           For "regular" reductions we can therefore use the same vector type
6872           (and also the same tree-code) when generating the epilog code and
6873           when generating the code inside the loop.  */
6874
6875   if (orig_stmt_info
6876       && (reduction_type == TREE_CODE_REDUCTION
6877           || reduction_type == FOLD_LEFT_REDUCTION))
6878     {
6879       /* This is a reduction pattern: get the vectype from the type of the
6880          reduction variable, and get the tree-code from orig_stmt.  */
6881       orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6882       gcc_assert (vectype_out);
6883       vec_mode = TYPE_MODE (vectype_out);
6884     }
6885   else
6886     {
6887       /* Regular reduction: use the same vectype and tree-code as used for
6888          the vector code inside the loop can be used for the epilog code. */
6889       orig_code = code;
6890
6891       if (code == MINUS_EXPR)
6892         orig_code = PLUS_EXPR;
6893
6894       /* For simple condition reductions, replace with the actual expression
6895          we want to base our reduction around.  */
6896       if (reduction_type == CONST_COND_REDUCTION)
6897         {
6898           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6899           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6900         }
6901       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6902         orig_code = cond_reduc_op_code;
6903     }
6904
6905   reduc_fn = IFN_LAST;
6906
6907   if (reduction_type == TREE_CODE_REDUCTION
6908       || reduction_type == FOLD_LEFT_REDUCTION
6909       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6910       || reduction_type == CONST_COND_REDUCTION)
6911     {
6912       if (reduction_type == FOLD_LEFT_REDUCTION
6913           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6914           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6915         {
6916           if (reduc_fn != IFN_LAST
6917               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6918                                                   OPTIMIZE_FOR_SPEED))
6919             {
6920               if (dump_enabled_p ())
6921                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922                                  "reduc op not supported by target.\n");
6923
6924               reduc_fn = IFN_LAST;
6925             }
6926         }
6927       else
6928         {
6929           if (!nested_cycle || double_reduc)
6930             {
6931               if (dump_enabled_p ())
6932                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933                                  "no reduc code for scalar code.\n");
6934
6935               return false;
6936             }
6937         }
6938     }
6939   else if (reduction_type == COND_REDUCTION)
6940     {
6941       int scalar_precision
6942         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6943       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6944       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6945                                                 nunits_out);
6946
6947       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6948                                           OPTIMIZE_FOR_SPEED))
6949         reduc_fn = IFN_REDUC_MAX;
6950     }
6951
6952   if (reduction_type != EXTRACT_LAST_REDUCTION
6953       && (!nested_cycle || double_reduc)
6954       && reduc_fn == IFN_LAST
6955       && !nunits_out.is_constant ())
6956     {
6957       if (dump_enabled_p ())
6958         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959                          "missing target support for reduction on"
6960                          " variable-length vectors.\n");
6961       return false;
6962     }
6963
6964   /* For SLP reductions, see if there is a neutral value we can use.  */
6965   tree neutral_op = NULL_TREE;
6966   if (slp_node)
6967     neutral_op = neutral_op_for_slp_reduction
6968       (slp_node_instance->reduc_phis, code,
6969        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6970
6971   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6972     {
6973       /* We can't support in-order reductions of code such as this:
6974
6975            for (int i = 0; i < n1; ++i)
6976              for (int j = 0; j < n2; ++j)
6977                l += a[j];
6978
6979          since GCC effectively transforms the loop when vectorizing:
6980
6981            for (int i = 0; i < n1 / VF; ++i)
6982              for (int j = 0; j < n2; ++j)
6983                for (int k = 0; k < VF; ++k)
6984                  l += a[j];
6985
6986          which is a reassociation of the original operation.  */
6987       if (dump_enabled_p ())
6988         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6989                          "in-order double reduction not supported.\n");
6990
6991       return false;
6992     }
6993
6994   if (reduction_type == FOLD_LEFT_REDUCTION
6995       && slp_node
6996       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6997     {
6998       /* We cannot use in-order reductions in this case because there is
6999          an implicit reassociation of the operations involved.  */
7000       if (dump_enabled_p ())
7001         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7002                          "in-order unchained SLP reductions not supported.\n");
7003       return false;
7004     }
7005
7006   /* For double reductions, and for SLP reductions with a neutral value,
7007      we construct a variable-length initial vector by loading a vector
7008      full of the neutral value and then shift-and-inserting the start
7009      values into the low-numbered elements.  */
7010   if ((double_reduc || neutral_op)
7011       && !nunits_out.is_constant ()
7012       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7013                                           vectype_out, OPTIMIZE_FOR_SPEED))
7014     {
7015       if (dump_enabled_p ())
7016         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7017                          "reduction on variable-length vectors requires"
7018                          " target support for a vector-shift-and-insert"
7019                          " operation.\n");
7020       return false;
7021     }
7022
7023   /* Check extra constraints for variable-length unchained SLP reductions.  */
7024   if (STMT_SLP_TYPE (stmt_info)
7025       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7026       && !nunits_out.is_constant ())
7027     {
7028       /* We checked above that we could build the initial vector when
7029          there's a neutral element value.  Check here for the case in
7030          which each SLP statement has its own initial value and in which
7031          that value needs to be repeated for every instance of the
7032          statement within the initial vector.  */
7033       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7034       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7035       if (!neutral_op
7036           && !can_duplicate_and_interleave_p (group_size, elt_mode))
7037         {
7038           if (dump_enabled_p ())
7039             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7040                              "unsupported form of SLP reduction for"
7041                              " variable-length vectors: cannot build"
7042                              " initial vector.\n");
7043           return false;
7044         }
7045       /* The epilogue code relies on the number of elements being a multiple
7046          of the group size.  The duplicate-and-interleave approach to setting
7047          up the initial vector does too.  */
7048       if (!multiple_p (nunits_out, group_size))
7049         {
7050           if (dump_enabled_p ())
7051             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052                              "unsupported form of SLP reduction for"
7053                              " variable-length vectors: the vector size"
7054                              " is not a multiple of the number of results.\n");
7055           return false;
7056         }
7057     }
7058
7059   /* In case of widening multiplication by a constant, we update the type
7060      of the constant to be the type of the other operand.  We check that the
7061      constant fits the type in the pattern recognition pass.  */
7062   if (code == DOT_PROD_EXPR
7063       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7064     {
7065       if (TREE_CODE (ops[0]) == INTEGER_CST)
7066         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7067       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7068         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7069       else
7070         {
7071           if (dump_enabled_p ())
7072             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7073                              "invalid types in dot-prod\n");
7074
7075           return false;
7076         }
7077     }
7078
7079   if (reduction_type == COND_REDUCTION)
7080     {
7081       widest_int ni;
7082
7083       if (! max_loop_iterations (loop, &ni))
7084         {
7085           if (dump_enabled_p ())
7086             dump_printf_loc (MSG_NOTE, vect_location,
7087                              "loop count not known, cannot create cond "
7088                              "reduction.\n");
7089           return false;
7090         }
7091       /* Convert backedges to iterations.  */
7092       ni += 1;
7093
7094       /* The additional index will be the same type as the condition.  Check
7095          that the loop can fit into this less one (because we'll use up the
7096          zero slot for when there are no matches).  */
7097       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7098       if (wi::geu_p (ni, wi::to_widest (max_index)))
7099         {
7100           if (dump_enabled_p ())
7101             dump_printf_loc (MSG_NOTE, vect_location,
7102                              "loop size is greater than data size.\n");
7103           return false;
7104         }
7105     }
7106
7107   /* In case the vectorization factor (VF) is bigger than the number
7108      of elements that we can fit in a vectype (nunits), we have to generate
7109      more than one vector stmt - i.e - we need to "unroll" the
7110      vector stmt by a factor VF/nunits.  For more details see documentation
7111      in vectorizable_operation.  */
7112
7113   /* If the reduction is used in an outer loop we need to generate
7114      VF intermediate results, like so (e.g. for ncopies=2):
7115         r0 = phi (init, r0)
7116         r1 = phi (init, r1)
7117         r0 = x0 + r0;
7118         r1 = x1 + r1;
7119     (i.e. we generate VF results in 2 registers).
7120     In this case we have a separate def-use cycle for each copy, and therefore
7121     for each copy we get the vector def for the reduction variable from the
7122     respective phi node created for this copy.
7123
7124     Otherwise (the reduction is unused in the loop nest), we can combine
7125     together intermediate results, like so (e.g. for ncopies=2):
7126         r = phi (init, r)
7127         r = x0 + r;
7128         r = x1 + r;
7129    (i.e. we generate VF/2 results in a single register).
7130    In this case for each copy we get the vector def for the reduction variable
7131    from the vectorized reduction operation generated in the previous iteration.
7132
7133    This only works when we see both the reduction PHI and its only consumer
7134    in vectorizable_reduction and there are no intermediate stmts
7135    participating.  */
7136   stmt_vec_info use_stmt_info;
7137   tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7138   if (ncopies > 1
7139       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7140       && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7141       && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7142     {
7143       single_defuse_cycle = true;
7144       epilog_copies = 1;
7145     }
7146   else
7147     epilog_copies = ncopies;
7148
7149   /* If the reduction stmt is one of the patterns that have lane
7150      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7151   if ((ncopies > 1
7152        && ! single_defuse_cycle)
7153       && (code == DOT_PROD_EXPR
7154           || code == WIDEN_SUM_EXPR
7155           || code == SAD_EXPR))
7156     {
7157       if (dump_enabled_p ())
7158         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7159                          "multi def-use cycle not possible for lane-reducing "
7160                          "reduction operation\n");
7161       return false;
7162     }
7163
7164   if (slp_node)
7165     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7166   else
7167     vec_num = 1;
7168
7169   internal_fn cond_fn = get_conditional_internal_fn (code);
7170   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7171   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7172
7173   if (!vec_stmt) /* transformation not required.  */
7174     {
7175       vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7176       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7177         {
7178           if (reduction_type != FOLD_LEFT_REDUCTION
7179               && !mask_by_cond_expr
7180               && (cond_fn == IFN_LAST
7181                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7182                                                       OPTIMIZE_FOR_SPEED)))
7183             {
7184               if (dump_enabled_p ())
7185                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7186                                  "can't use a fully-masked loop because no"
7187                                  " conditional operation is available.\n");
7188               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7189             }
7190           else if (reduc_index == -1)
7191             {
7192               if (dump_enabled_p ())
7193                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7194                                  "can't use a fully-masked loop for chained"
7195                                  " reductions.\n");
7196               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7197             }
7198           else
7199             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7200                                    vectype_in);
7201         }
7202       if (dump_enabled_p ()
7203           && reduction_type == FOLD_LEFT_REDUCTION)
7204         dump_printf_loc (MSG_NOTE, vect_location,
7205                          "using an in-order (fold-left) reduction.\n");
7206       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7207       return true;
7208     }
7209
7210   /* Transform.  */
7211
7212   if (dump_enabled_p ())
7213     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7214
7215   /* FORNOW: Multiple types are not supported for condition.  */
7216   if (code == COND_EXPR)
7217     gcc_assert (ncopies == 1);
7218
7219   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7220
7221   if (reduction_type == FOLD_LEFT_REDUCTION)
7222     return vectorize_fold_left_reduction
7223       (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7224        reduc_fn, ops, vectype_in, reduc_index, masks);
7225
7226   if (reduction_type == EXTRACT_LAST_REDUCTION)
7227     {
7228       gcc_assert (!slp_node);
7229       return vectorizable_condition (stmt_info, gsi, vec_stmt,
7230                                      true, NULL, NULL);
7231     }
7232
7233   /* Create the destination vector  */
7234   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7235
7236   prev_stmt_info = NULL;
7237   prev_phi_info = NULL;
7238   if (!slp_node)
7239     {
7240       vec_oprnds0.create (1);
7241       vec_oprnds1.create (1);
7242       if (op_type == ternary_op)
7243         vec_oprnds2.create (1);
7244     }
7245
7246   phis.create (vec_num);
7247   vect_defs.create (vec_num);
7248   if (!slp_node)
7249     vect_defs.quick_push (NULL_TREE);
7250
7251   if (slp_node)
7252     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7253   else
7254     phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7255
7256   for (j = 0; j < ncopies; j++)
7257     {
7258       if (code == COND_EXPR)
7259         {
7260           gcc_assert (!slp_node);
7261           vectorizable_condition (stmt_info, gsi, vec_stmt,
7262                                   true, NULL, NULL);
7263           break;
7264         }
7265       if (code == LSHIFT_EXPR
7266           || code == RSHIFT_EXPR)
7267         {
7268           vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7269           break;
7270         }
7271
7272       /* Handle uses.  */
7273       if (j == 0)
7274         {
7275           if (slp_node)
7276             {
7277               /* Get vec defs for all the operands except the reduction index,
7278                  ensuring the ordering of the ops in the vector is kept.  */
7279               auto_vec<tree, 3> slp_ops;
7280               auto_vec<vec<tree>, 3> vec_defs;
7281
7282               slp_ops.quick_push (ops[0]);
7283               slp_ops.quick_push (ops[1]);
7284               if (op_type == ternary_op)
7285                 slp_ops.quick_push (ops[2]);
7286
7287               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7288
7289               vec_oprnds0.safe_splice (vec_defs[0]);
7290               vec_defs[0].release ();
7291               vec_oprnds1.safe_splice (vec_defs[1]);
7292               vec_defs[1].release ();
7293               if (op_type == ternary_op)
7294                 {
7295                   vec_oprnds2.safe_splice (vec_defs[2]);
7296                   vec_defs[2].release ();
7297                 }
7298             }
7299           else
7300             {
7301               vec_oprnds0.quick_push
7302                 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7303               vec_oprnds1.quick_push
7304                 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7305               if (op_type == ternary_op)
7306                 vec_oprnds2.quick_push
7307                   (vect_get_vec_def_for_operand (ops[2], stmt_info));
7308             }
7309         }
7310       else
7311         {
7312           if (!slp_node)
7313             {
7314               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7315
7316               if (single_defuse_cycle && reduc_index == 0)
7317                 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7318               else
7319                 vec_oprnds0[0]
7320                   = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7321                                                     vec_oprnds0[0]);
7322               if (single_defuse_cycle && reduc_index == 1)
7323                 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7324               else
7325                 vec_oprnds1[0]
7326                   = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7327                                                     vec_oprnds1[0]);
7328               if (op_type == ternary_op)
7329                 {
7330                   if (single_defuse_cycle && reduc_index == 2)
7331                     vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7332                   else
7333                     vec_oprnds2[0]
7334                       = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7335                                                         vec_oprnds2[0]);
7336                 }
7337             }
7338         }
7339
7340       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7341         {
7342           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7343           if (masked_loop_p && !mask_by_cond_expr)
7344             {
7345               /* Make sure that the reduction accumulator is vop[0].  */
7346               if (reduc_index == 1)
7347                 {
7348                   gcc_assert (commutative_tree_code (code));
7349                   std::swap (vop[0], vop[1]);
7350                 }
7351               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7352                                               vectype_in, i * ncopies + j);
7353               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7354                                                         vop[0], vop[1],
7355                                                         vop[0]);
7356               new_temp = make_ssa_name (vec_dest, call);
7357               gimple_call_set_lhs (call, new_temp);
7358               gimple_call_set_nothrow (call, true);
7359               new_stmt_info
7360                 = vect_finish_stmt_generation (stmt_info, call, gsi);
7361             }
7362           else
7363             {
7364               if (op_type == ternary_op)
7365                 vop[2] = vec_oprnds2[i];
7366
7367               if (masked_loop_p && mask_by_cond_expr)
7368                 {
7369                   tree mask = vect_get_loop_mask (gsi, masks,
7370                                                   vec_num * ncopies,
7371                                                   vectype_in, i * ncopies + j);
7372                   build_vect_cond_expr (code, vop, mask, gsi);
7373                 }
7374
7375               gassign *new_stmt = gimple_build_assign (vec_dest, code,
7376                                                        vop[0], vop[1], vop[2]);
7377               new_temp = make_ssa_name (vec_dest, new_stmt);
7378               gimple_assign_set_lhs (new_stmt, new_temp);
7379               new_stmt_info
7380                 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7381             }
7382
7383           if (slp_node)
7384             {
7385               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7386               vect_defs.quick_push (new_temp);
7387             }
7388           else
7389             vect_defs[0] = new_temp;
7390         }
7391
7392       if (slp_node)
7393         continue;
7394
7395       if (j == 0)
7396         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7397       else
7398         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7399
7400       prev_stmt_info = new_stmt_info;
7401     }
7402
7403   /* Finalize the reduction-phi (set its arguments) and create the
7404      epilog reduction code.  */
7405   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7406     vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7407
7408   vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7409                                     epilog_copies, reduc_fn, phis,
7410                                     double_reduc, slp_node, slp_node_instance,
7411                                     cond_reduc_val, cond_reduc_op_code,
7412                                     neutral_op);
7413
7414   return true;
7415 }
7416
7417 /* Function vect_min_worthwhile_factor.
7418
7419    For a loop where we could vectorize the operation indicated by CODE,
7420    return the minimum vectorization factor that makes it worthwhile
7421    to use generic vectors.  */
7422 static unsigned int
7423 vect_min_worthwhile_factor (enum tree_code code)
7424 {
7425   switch (code)
7426     {
7427     case PLUS_EXPR:
7428     case MINUS_EXPR:
7429     case NEGATE_EXPR:
7430       return 4;
7431
7432     case BIT_AND_EXPR:
7433     case BIT_IOR_EXPR:
7434     case BIT_XOR_EXPR:
7435     case BIT_NOT_EXPR:
7436       return 2;
7437
7438     default:
7439       return INT_MAX;
7440     }
7441 }
7442
7443 /* Return true only when VINFO is a loop_vec_info whose vectorization
7444    factor is a compile-time constant that is at least
7445    vect_min_worthwhile_factor (CODE); otherwise return false.  */
7446
7447 bool
7448 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7449 {
7450   unsigned HOST_WIDE_INT const_vf;
7451   loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
7452   if (!linfo || !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&const_vf))
7453     return false;
7454   return const_vf >= vect_min_worthwhile_factor (code);
7455 }
7456
7457 /* Function vectorizable_induction
7458
7459    Check if STMT_INFO performs an induction computation that can be vectorized.
7460    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7461    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7462    Return true if STMT_INFO is vectorizable in this way.

        When VEC_STMT is null only the analysis is done: the statement is
        tagged as induc_vec_info_type and its cost is recorded in COST_VEC
        through vect_model_induction_cost.  When SLP_NODE is nonnull the
        generated IVs are pushed onto SLP_TREE_VEC_STMTS (SLP_NODE) and
        VEC_STMT is left untouched.  GSI is unused.  */
7463
7464 bool
7465 vectorizable_induction (stmt_vec_info stmt_info,
7466                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7467                         stmt_vec_info *vec_stmt, slp_tree slp_node,
7468                         stmt_vector_for_cost *cost_vec)
7469 {
7470   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7471   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7472   unsigned ncopies;
7473   bool nested_in_vect_loop = false;
7474   struct loop *iv_loop;
7475   tree vec_def;
7476   edge pe = loop_preheader_edge (loop);
7477   basic_block new_bb;
7478   tree new_vec, vec_init, vec_step, t;
7479   tree new_name;
7480   gimple *new_stmt;
7481   gphi *induction_phi;
7482   tree induc_def, vec_dest;
7483   tree init_expr, step_expr;
7484   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7485   unsigned i;
7486   tree expr;
7487   gimple_seq stmts;
7488   imm_use_iterator imm_iter;
7489   use_operand_p use_p;
7490   gimple *exit_phi;
7491   edge latch_e;
7492   tree loop_arg;
7493   gimple_stmt_iterator si;
7494
       /* Only PHI statements are handled here.  */
7495   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7496   if (!phi)
7497     return false;
7498
7499   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7500     return false;
7501
7502   /* Make sure it was recognized as induction computation.  */
7503   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7504     return false;
7505
7506   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7507   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7508
       /* An SLP node is treated as a single copy; otherwise the number of
          copies comes from vect_get_num_copies.  */
7509   if (slp_node)
7510     ncopies = 1;
7511   else
7512     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7513   gcc_assert (ncopies >= 1);
7514
7515   /* FORNOW. These restrictions should be relaxed.  */
7516   if (nested_in_vect_loop_p (loop, stmt_info))
7517     {
           /* NOTE: these declarations shadow the function-level variables
              of the same names, so nothing computed in this block is
              visible to the transform code below, which recomputes
              LATCH_E and LOOP_ARG itself.  */
7518       imm_use_iterator imm_iter;
7519       use_operand_p use_p;
7520       gimple *exit_phi;
7521       edge latch_e;
7522       tree loop_arg;
7523
7524       if (ncopies > 1)
7525         {
7526           if (dump_enabled_p ())
7527             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7528                              "multiple types in nested loop.\n");
7529           return false;
7530         }
7531
7532       /* FORNOW: outer loop induction with SLP not supported.  */
7533       if (STMT_SLP_TYPE (stmt_info))
7534         return false;
7535
           /* Look for a non-debug use of the latch value that lies outside
              the inner loop.  */
7536       exit_phi = NULL;
7537       latch_e = loop_latch_edge (loop->inner);
7538       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7539       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7540         {
7541           gimple *use_stmt = USE_STMT (use_p);
7542           if (is_gimple_debug (use_stmt))
7543             continue;
7544
7545           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7546             {
7547               exit_phi = use_stmt;
7548               break;
7549             }
7550         }
7551       if (exit_phi)
7552         {
7553           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
               /* Give up unless that use is relevant and not live.  */
7554           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7555                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7556             {
7557               if (dump_enabled_p ())
7558                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7559                                  "inner-loop induction only used outside "
7560                                  "of the outer vectorized loop.\n");
7561               return false;
7562             }
7563         }
7564
7565       nested_in_vect_loop = true;
7566       iv_loop = loop->inner;
7567     }
7568   else
7569     iv_loop = loop;
       /* IV_LOOP is expected to be the loop_father of PHI's block.  */
7570   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7571
7572   if (slp_node && !nunits.is_constant ())
7573     {
7574       /* The current SLP code creates the initial value element-by-element.  */
7575       if (dump_enabled_p ())
7576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7577                          "SLP induction not supported for variable-length"
7578                          " vectors.\n");
7579       return false;
7580     }
7581
7582   if (!vec_stmt) /* transformation not required.  */
7583     {
           /* Analysis only: record the kind of vectorization and its cost.  */
7584       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7585       DUMP_VECT_SCOPE ("vectorizable_induction");
7586       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7587       return true;
7588     }
7589
7590   /* Transform.  */
7591
7592   /* Compute a vector variable, initialized with the first VF values of
7593      the induction variable.  E.g., for an iv with IV_PHI='X' and
7594      evolution S, for a vector of 4 units, we want to compute:
7595      [X, X + S, X + 2*S, X + 3*S].  */
7596
7597   if (dump_enabled_p ())
7598     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7599
       /* LOOP_ARG is the value PHI receives over the latch edge of IV_LOOP.  */
7600   latch_e = loop_latch_edge (iv_loop);
7601   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7602
7603   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7604   gcc_assert (step_expr != NULL_TREE);
7605
       /* From here on PE is the preheader edge of IV_LOOP; the setup
          sequences built below are inserted on it.  */
7606   pe = loop_preheader_edge (iv_loop);
7607   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7608                                      loop_preheader_edge (iv_loop));
7609
7610   stmts = NULL;
7611   if (!nested_in_vect_loop)
7612     {
7613       /* Convert the initial value to the desired type.  */
7614       tree new_type = TREE_TYPE (vectype);
7615       init_expr = gimple_convert (&stmts, new_type, init_expr);
7616
7617       /* If we are using the loop mask to "peel" for alignment then we need
7618          to adjust the start value here.  */
7619       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7620       if (skip_niters != NULL_TREE)
7621         {
7622           if (FLOAT_TYPE_P (vectype))
7623             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7624                                         skip_niters);
7625           else
7626             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
               /* init_expr = init_expr - skip_niters * step_expr.  */
7627           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7628                                          skip_niters, step_expr);
7629           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7630                                     init_expr, skip_step);
7631         }
7632     }
7633
7634   /* Convert the step to the desired type.  */
7635   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7636
7637   if (stmts)
7638     {
7639       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
           /* Inserting on the preheader edge is not expected to create a
              new block.  */
7640       gcc_assert (!new_bb);
7641     }
7642
7643   /* Find the first insertion point in the BB.  */
7644   basic_block bb = gimple_bb (phi);
7645   si = gsi_after_labels (bb);
7646
7647   /* For SLP induction we have to generate several IVs as for example
7648      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7649      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7650      [VF*S, VF*S, VF*S, VF*S] for all.  */
7651   if (slp_node)
7652     {
7653       /* Enforced above.  */
7654       unsigned int const_nunits = nunits.to_constant ();
7655
7656       /* Generate [VF*S, VF*S, ... ].  */
7657       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7658         {
7659           expr = build_int_cst (integer_type_node, vf);
7660           expr = fold_convert (TREE_TYPE (step_expr), expr);
7661         }
7662       else
7663         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7664       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7665                               expr, step_expr);
7666       if (! CONSTANT_CLASS_P (new_name))
7667         new_name = vect_init_vector (stmt_info, new_name,
7668                                      TREE_TYPE (step_expr), NULL);
7669       new_vec = build_vector_from_val (vectype, new_name);
7670       vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7671
7672       /* Now generate the IVs.  */
7673       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7674       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7675       unsigned elts = const_nunits * nvects;
7676       unsigned nivs = least_common_multiple (group_size,
7677                                              const_nunits) / const_nunits;
           /* NIVS IVs are built from scratch in the loop below; any
              remaining vectors up to NVECTS are derived from them
              afterwards.  */
7678       gcc_assert (elts % group_size == 0);
7679       tree elt = init_expr;
7680       unsigned ivn;
7681       for (ivn = 0; ivn < nivs; ++ivn)
7682         {
               /* NOTE: this builder shadows the ELTS lane count declared
                  above.  */
7683           tree_vector_builder elts (vectype, const_nunits, 1);
7684           stmts = NULL;
7685           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7686             {
                   /* Move on to the next scalar value each time a whole
                      group of GROUP_SIZE lanes has been filled.  */
7687               if (ivn*const_nunits + eltn >= group_size
7688                   && (ivn * const_nunits + eltn) % group_size == 0)
7689                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7690                                     elt, step_expr);
7691               elts.quick_push (elt);
7692             }
7693           vec_init = gimple_build_vector (&stmts, &elts);
7694           if (stmts)
7695             {
7696               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7697               gcc_assert (!new_bb);
7698             }
7699
7700           /* Create the induction-phi that defines the induction-operand.  */
7701           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7702           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7703           stmt_vec_info induction_phi_info
7704             = loop_vinfo->add_stmt (induction_phi);
7705           induc_def = PHI_RESULT (induction_phi);
7706
7707           /* Create the iv update inside the loop  */
7708           vec_def = make_ssa_name (vec_dest);
7709           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7710           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7711           loop_vinfo->add_stmt (new_stmt);
7712
7713           /* Set the arguments of the phi node:  */
7714           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7715           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7716                        UNKNOWN_LOCATION);
7717
7718           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7719         }
7720
7721       /* Re-use IVs when we can.  */
7722       if (ivn < nvects)
7723         {
7724           unsigned vfp
7725             = least_common_multiple (group_size, const_nunits) / group_size;
7726           /* Generate [VF'*S, VF'*S, ... ].  */
7727           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7728             {
7729               expr = build_int_cst (integer_type_node, vfp);
7730               expr = fold_convert (TREE_TYPE (step_expr), expr);
7731             }
7732           else
7733             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7734           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7735                                   expr, step_expr);
7736           if (! CONSTANT_CLASS_P (new_name))
7737             new_name = vect_init_vector (stmt_info, new_name,
7738                                          TREE_TYPE (step_expr), NULL);
7739           new_vec = build_vector_from_val (vectype, new_name);
7740           vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7741           for (; ivn < nvects; ++ivn)
7742             {
                   /* Derive vector IVN from the vector NIVS positions
                      earlier in SLP_TREE_VEC_STMTS.  */
7743               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7744               tree def;
7745               if (gimple_code (iv) == GIMPLE_PHI)
7746                 def = gimple_phi_result (iv);
7747               else
7748                 def = gimple_assign_lhs (iv);
7749               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7750                                               PLUS_EXPR,
7751                                               def, vec_step);
                   /* A value based on a PHI is inserted at SI; one based on
                      an assignment goes right after that assignment.  */
7752               if (gimple_code (iv) == GIMPLE_PHI)
7753                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7754               else
7755                 {
7756                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7757                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7758                 }
7759               SLP_TREE_VEC_STMTS (slp_node).quick_push
7760                 (loop_vinfo->add_stmt (new_stmt));
7761             }
7762         }
7763
           /* The SLP results are recorded in SLP_TREE_VEC_STMTS only;
              *VEC_STMT is not set on this path.  */
7764       return true;
7765     }
7766
7767   /* Create the vector that holds the initial_value of the induction.  */
7768   if (nested_in_vect_loop)
7769     {
7770       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7771          been created during vectorization of previous stmts.  We obtain it
7772          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7773       vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7774       /* If the initial value is not of proper type, convert it.  */
7775       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7776         {
7777           new_stmt
7778             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7779                                                           vect_simple_var,
7780                                                           "vec_iv_"),
7781                                    VIEW_CONVERT_EXPR,
7782                                    build1 (VIEW_CONVERT_EXPR, vectype,
7783                                            vec_init));
7784           vec_init = gimple_assign_lhs (new_stmt);
7785           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7786                                                  new_stmt);
7787           gcc_assert (!new_bb);
7788           loop_vinfo->add_stmt (new_stmt);
7789         }
7790     }
7791   else
7792     {
7793       /* iv_loop is the loop to be vectorized. Create:
7794          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7795       stmts = NULL;
7796       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7797
7798       unsigned HOST_WIDE_INT const_nunits;
           /* With a constant number of lanes, build the lanes one by one.  */
7799       if (nunits.is_constant (&const_nunits))
7800         {
7801           tree_vector_builder elts (vectype, const_nunits, 1);
7802           elts.quick_push (new_name);
7803           for (i = 1; i < const_nunits; i++)
7804             {
7805               /* Create: new_name_i = new_name + step_expr  */
7806               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7807                                        new_name, step_expr);
7808               elts.quick_push (new_name);
7809             }
7810           /* Create a vector from [new_name_0, new_name_1, ...,
7811              new_name_nunits-1]  */
7812           vec_init = gimple_build_vector (&stmts, &elts);
7813         }
7814       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7815         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7816         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7817                                  new_name, step_expr);
7818       else
7819         {
7820           /* Build:
7821                 [base, base, base, ...]
7822                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
               /* This path expects a floating-point step and
                  flag_associative_math to be set.  */
7823           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7824           gcc_assert (flag_associative_math);
7825           tree index = build_index_vector (vectype, 0, 1);
7826           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7827                                                         new_name);
7828           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7829                                                         step_expr);
7830           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7831           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7832                                    vec_init, step_vec);
7833           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7834                                    vec_init, base_vec);
7835         }
7836
7837       if (stmts)
7838         {
7839           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7840           gcc_assert (!new_bb);
7841         }
7842     }
7843
7844
7845   /* Create the vector that holds the step of the induction.  */
7846   if (nested_in_vect_loop)
7847     /* iv_loop is nested in the loop to be vectorized. Generate:
7848        vec_step = [S, S, S, S]  */
7849     new_name = step_expr;
7850   else
7851     {
7852       /* iv_loop is the loop to be vectorized. Generate:
7853           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7854       gimple_seq seq = NULL;
7855       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7856         {
7857           expr = build_int_cst (integer_type_node, vf);
7858           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7859         }
7860       else
7861         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7862       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7863                                expr, step_expr);
7864       if (seq)
7865         {
7866           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7867           gcc_assert (!new_bb);
7868         }
7869     }
7870
       /* Splat the scalar step into a vector of VECTYPE.  */
7871   t = unshare_expr (new_name);
7872   gcc_assert (CONSTANT_CLASS_P (new_name)
7873               || TREE_CODE (new_name) == SSA_NAME);
7874   new_vec = build_vector_from_val (vectype, t);
7875   vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7876
7877
7878   /* Create the following def-use cycle:
7879      loop prolog:
7880          vec_init = ...
7881          vec_step = ...
7882      loop:
7883          vec_iv = PHI <vec_init, vec_loop>
7884          ...
7885          STMT
7886          ...
7887          vec_loop = vec_iv + vec_step;  */
7888
7889   /* Create the induction-phi that defines the induction-operand.  */
7890   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7891   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7892   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7893   induc_def = PHI_RESULT (induction_phi);
7894
7895   /* Create the iv update inside the loop  */
7896   vec_def = make_ssa_name (vec_dest);
7897   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7898   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7899   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7900
7901   /* Set the arguments of the phi node:  */
7902   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7903   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7904                UNKNOWN_LOCATION);
7905
       /* The vectorized PHI is the result reported for the first copy.  */
7906   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7907
7908   /* In case that vectorization factor (VF) is bigger than the number
7909      of elements that we can fit in a vectype (nunits), we have to generate
7910      more than one vector stmt - i.e - we need to "unroll" the
7911      vector stmt by a factor VF/nunits.  For more details see documentation
7912      in vectorizable_operation.  */
7913
7914   if (ncopies > 1)
7915     {
7916       gimple_seq seq = NULL;
7917       stmt_vec_info prev_stmt_vinfo;
7918       /* FORNOW. This restriction should be relaxed.  */
7919       gcc_assert (!nested_in_vect_loop);
7920
7921       /* Create the vector that holds the step of the induction.  */
           /* Each further copy is the previous one plus a splat of
              NUNITS * S.  */
7922       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7923         {
7924           expr = build_int_cst (integer_type_node, nunits);
7925           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7926         }
7927       else
7928         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7929       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7930                                expr, step_expr);
7931       if (seq)
7932         {
7933           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7934           gcc_assert (!new_bb);
7935         }
7936
7937       t = unshare_expr (new_name);
7938       gcc_assert (CONSTANT_CLASS_P (new_name)
7939                   || TREE_CODE (new_name) == SSA_NAME);
7940       new_vec = build_vector_from_val (vectype, t);
7941       vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7942
7943       vec_def = induc_def;
7944       prev_stmt_vinfo = induction_phi_info;
7945       for (i = 1; i < ncopies; i++)
7946         {
7947           /* vec_i = vec_prev + vec_step  */
7948           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7949                                           vec_def, vec_step);
7950           vec_def = make_ssa_name (vec_dest, new_stmt);
7951           gimple_assign_set_lhs (new_stmt, vec_def);
7952
7953           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7954           new_stmt_info = loop_vinfo->add_stmt (new_stmt);
               /* Chain the copies through STMT_VINFO_RELATED_STMT.  */
7955           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7956           prev_stmt_vinfo = new_stmt_info;
7957         }
7958     }
7959
7960   if (nested_in_vect_loop)
7961     {
7962       /* Find the loop-closed exit-phi of the induction, and record
7963          the final vector of induction results:  */
7964       exit_phi = NULL;
7965       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7966         {
7967           gimple *use_stmt = USE_STMT (use_p);
7968           if (is_gimple_debug (use_stmt))
7969             continue;
7970
7971           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7972             {
7973               exit_phi = use_stmt;
7974               break;
7975             }
7976         }
7977       if (exit_phi)
7978         {
7979           stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7980           /* FORNOW. Currently not supporting the case that an inner-loop induction
7981              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7982           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7983                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7984
               /* With a single copy (multiple copies are rejected for nested
                  loops above) NEW_STMT_INFO is the IV update statement.  */
7985           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7986           if (dump_enabled_p ())
7987             dump_printf_loc (MSG_NOTE, vect_location,
7988                              "vector of inductions after inner-loop:%G",
7989                              new_stmt);
7990         }
7991     }
7992
7993
7994   if (dump_enabled_p ())
7995     dump_printf_loc (MSG_NOTE, vect_location,
7996                      "transform induction: created def-use cycle: %G%G",
7997                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
7998
7999   return true;
8000 }
8001
8002 /* Function vectorizable_live_operation.
8003
8004    STMT_INFO computes a value that is used outside the loop.  Check if
8005    it can be supported.

        When VEC_STMT is null only the analysis is done; it may clear
        LOOP_VINFO_CAN_FULLY_MASK_P or record a loop mask.  Otherwise the
        scalar result is extracted from the vectorized statement, the
        extraction is inserted on the loop's single exit edge, and uses of
        the original scalar result outside the loop are redirected to it.
        For SLP, SLP_NODE and SLP_INDEX select the vector and lane to read.
        GSI is only forwarded to vect_get_loop_mask and the unnamed cost
        vector argument is unused.  Return true on success.  */
8006
8007 bool
8008 vectorizable_live_operation (stmt_vec_info stmt_info,
8009                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8010                              slp_tree slp_node, int slp_index,
8011                              stmt_vec_info *vec_stmt,
8012                              stmt_vector_for_cost *)
8013 {
8014   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8015   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8016   imm_use_iterator imm_iter;
8017   tree lhs, lhs_type, bitsize, vec_bitsize;
8018   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8019   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8020   int ncopies;
8021   gimple *use_stmt;
8022   auto_vec<tree> vec_oprnds;
8023   int vec_entry = 0;
8024   poly_uint64 vec_index = 0;
8025
8026   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8027
       /* Reduction definitions are not handled by this function.  */
8028   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8029     return false;
8030
8031   /* FORNOW.  CHECKME.  */
8032   if (nested_in_vect_loop_p (loop, stmt_info))
8033     return false;
8034
8035   /* If STMT is not relevant and it is a simple assignment and its inputs are
8036      invariant then it can remain in place, unvectorized.  The original last
8037      scalar value that it computes will be used.  */
8038   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8039     {
8040       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8041       if (dump_enabled_p ())
8042         dump_printf_loc (MSG_NOTE, vect_location,
8043                          "statement is simple and uses invariant.  Leaving in "
8044                          "place.\n");
8045       return true;
8046     }
8047
8048   if (slp_node)
8049     ncopies = 1;
8050   else
8051     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8052
8053   if (slp_node)
8054     {
8055       gcc_assert (slp_index >= 0);
8056
8057       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8058       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8059
8060       /* Get the last occurrence of the scalar index from the concatenation of
8061          all the slp vectors. Calculate which slp vector it is and the index
8062          within.  */
8063       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8064
8065       /* Calculate which vector contains the result, and which lane of
8066          that vector we need.  */
8067       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8068         {
8069           if (dump_enabled_p ())
8070             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071                              "Cannot determine which vector holds the"
8072                              " final result.\n");
8073           return false;
8074         }
8075     }
8076
8077   if (!vec_stmt)
8078     {
8079       /* No transformation required.  */
           /* If the loop could still be fully masked, check whether this
              live operation allows it; when it does not, the option is
              dropped and the analysis still succeeds.  */
8080       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8081         {
8082           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8083                                                OPTIMIZE_FOR_SPEED))
8084             {
8085               if (dump_enabled_p ())
8086                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8087                                  "can't use a fully-masked loop because "
8088                                  "the target doesn't support extract last "
8089                                  "reduction.\n");
8090               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8091             }
8092           else if (slp_node)
8093             {
8094               if (dump_enabled_p ())
8095                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8096                                  "can't use a fully-masked loop because an "
8097                                  "SLP statement is live after the loop.\n");
8098               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8099             }
8100           else if (ncopies > 1)
8101             {
8102               if (dump_enabled_p ())
8103                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8104                                  "can't use a fully-masked loop because"
8105                                  " ncopies is greater than 1.\n");
8106               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8107             }
8108           else
8109             {
8110               gcc_assert (ncopies == 1 && !slp_node);
8111               vect_record_loop_mask (loop_vinfo,
8112                                      &LOOP_VINFO_MASKS (loop_vinfo),
8113                                      1, vectype);
8114             }
8115         }
8116       return true;
8117     }
8118
8119   /* Use the lhs of the original scalar statement.  */
8120   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8121
8122   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8123         : gimple_get_lhs (stmt);
8124   lhs_type = TREE_TYPE (lhs);
8125
       /* BITSIZE is the width of one lane: the element precision for
          boolean vectors, the element size otherwise.  */
8126   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8127              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8128              : TYPE_SIZE (TREE_TYPE (vectype)));
8129   vec_bitsize = TYPE_SIZE (vectype);
8130
8131   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8132   tree vec_lhs, bitstart;
8133   if (slp_node)
8134     {
8135       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8136
8137       /* Get the correct slp vectorized stmt.  */
           /* NOTE: this local VEC_STMT shadows the function parameter of
              the same name.  */
8138       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8139       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8140         vec_lhs = gimple_phi_result (phi);
8141       else
8142         vec_lhs = gimple_get_lhs (vec_stmt);
8143
8144       /* Get entry to use.  */
           /* bitstart = VEC_INDEX * BITSIZE.  */
8145       bitstart = bitsize_int (vec_index);
8146       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8147     }
8148   else
8149     {
8150       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8151       vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8152       gcc_checking_assert (ncopies == 1
8153                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8154
8155       /* For multiple copies, get the last copy.  */
8156       for (int i = 1; i < ncopies; ++i)
8157         vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8158
8159       /* Get the last lane in the vector.  */
8160       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8161     }
8162
8163   gimple_seq stmts = NULL;
8164   tree new_tree;
8165   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8166     {
8167       /* Emit:
8168
8169            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8170
8171          where VEC_LHS is the vectorized live-out result and MASK is
8172          the loop mask for the final iteration.  */
8173       gcc_assert (ncopies == 1 && !slp_node);
8174       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8175       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8176                                       1, vectype, 0);
8177       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8178                                       scalar_type, mask, vec_lhs);
8179
8180       /* Convert the extracted vector element to the required scalar type.  */
8181       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8182     }
8183   else
8184     {
           /* Read BITSIZE bits at offset BITSTART from VEC_LHS; boolean
              vectors are read through an integer type of BITSIZE bits.  */
8185       tree bftype = TREE_TYPE (vectype);
8186       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8187         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8188       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8189       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8190                                        &stmts, true, NULL_TREE);
8191     }
8192
       /* Insert the extraction on the loop's single exit edge.  */
8193   if (stmts)
8194     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8195
8196   /* Replace use of lhs with newly computed result.  If the use stmt is a
8197      single arg PHI, just replace all uses of PHI result.  It's necessary
8198      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8199   use_operand_p use_p;
8200   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8201     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8202         && !is_gimple_debug (use_stmt))
8203     {
8204       if (gimple_code (use_stmt) == GIMPLE_PHI
8205           && gimple_phi_num_args (use_stmt) == 1)
8206         {
8207           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8208         }
8209       else
8210         {
8211           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8212             SET_USE (use_p, new_tree);
8213         }
8214       update_stmt (use_stmt);
8215     }
8216
8217   return true;
8218 }
8219
8220 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8221
8222 static void
8223 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8224 {
8225   ssa_op_iter op_iter;
8226   imm_use_iterator imm_iter;
8227   def_operand_p def_p;
8228   gimple *ustmt;
8229
8230   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8231     {
8232       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8233         {
8234           basic_block bb;
8235
8236           if (!is_gimple_debug (ustmt))
8237             continue;
8238
8239           bb = gimple_bb (ustmt);
8240
8241           if (!flow_bb_inside_loop_p (loop, bb))
8242             {
8243               if (gimple_debug_bind_p (ustmt))
8244                 {
8245                   if (dump_enabled_p ())
8246                     dump_printf_loc (MSG_NOTE, vect_location,
8247                                      "killing debug use\n");
8248
8249                   gimple_debug_bind_reset_value (ustmt);
8250                   update_stmt (ustmt);
8251                 }
8252               else
8253                 gcc_unreachable ();
8254             }
8255         }
8256     }
8257 }
8258
8259 /* Given loop represented by LOOP_VINFO, return true if computation of
8260    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8261    otherwise.  */
8262
8263 static bool
8264 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8265 {
8266   /* Constant case.  */
8267   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8268     {
8269       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8270       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8271
8272       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8273       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8274       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8275         return true;
8276     }
8277
8278   widest_int max;
8279   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8280   /* Check the upper bound of loop niters.  */
8281   if (get_max_loop_iterations (loop, &max))
8282     {
8283       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8284       signop sgn = TYPE_SIGN (type);
8285       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8286       if (max < type_max)
8287         return true;
8288     }
8289   return false;
8290 }
8291
8292 /* Return a mask type with half the number of elements as TYPE.  */
8293
8294 tree
8295 vect_halve_mask_nunits (tree type)
8296 {
8297   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8298   return build_truth_vector_type (nunits, current_vector_size);
8299 }
8300
8301 /* Return a mask type with twice as many elements as TYPE.  */
8302
8303 tree
8304 vect_double_mask_nunits (tree type)
8305 {
8306   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8307   return build_truth_vector_type (nunits, current_vector_size);
8308 }
8309
8310 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8311    contain a sequence of NVECTORS masks that each control a vector of type
8312    VECTYPE.  */
8313
8314 void
8315 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8316                        unsigned int nvectors, tree vectype)
8317 {
8318   gcc_assert (nvectors != 0);
8319   if (masks->length () < nvectors)
8320     masks->safe_grow_cleared (nvectors);
8321   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8322   /* The number of scalars per iteration and the number of vectors are
8323      both compile-time constants.  */
8324   unsigned int nscalars_per_iter
8325     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8326                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8327   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8328     {
8329       rgm->max_nscalars_per_iter = nscalars_per_iter;
8330       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8331     }
8332 }
8333
8334 /* Given a complete set of masks MASKS, extract mask number INDEX
8335    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8336    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8337
8338    See the comment above vec_loop_masks for more details about the mask
8339    arrangement.  */
8340
8341 tree
8342 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8343                     unsigned int nvectors, tree vectype, unsigned int index)
8344 {
8345   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8346   tree mask_type = rgm->mask_type;
8347
8348   /* Populate the rgroup's mask array, if this is the first time we've
8349      used it.  */
8350   if (rgm->masks.is_empty ())
8351     {
8352       rgm->masks.safe_grow_cleared (nvectors);
8353       for (unsigned int i = 0; i < nvectors; ++i)
8354         {
8355           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8356           /* Provide a dummy definition until the real one is available.  */
8357           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8358           rgm->masks[i] = mask;
8359         }
8360     }
8361
8362   tree mask = rgm->masks[index];
8363   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8364                 TYPE_VECTOR_SUBPARTS (vectype)))
8365     {
8366       /* A loop mask for data type X can be reused for data type Y
8367          if X has N times more elements than Y and if Y's elements
8368          are N times bigger than X's.  In this case each sequence
8369          of N elements in the loop mask will be all-zero or all-one.
8370          We can then view-convert the mask so that each sequence of
8371          N elements is replaced by a single element.  */
8372       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8373                               TYPE_VECTOR_SUBPARTS (vectype)));
8374       gimple_seq seq = NULL;
8375       mask_type = build_same_sized_truth_vector_type (vectype);
8376       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8377       if (seq)
8378         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8379     }
8380   return mask;
8381 }
8382
8383 /* Scale profiling counters by estimation for LOOP which is vectorized
8384    by factor VF.  */
8385
8386 static void
8387 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8388 {
8389   edge preheader = loop_preheader_edge (loop);
8390   /* Reduce loop iterations by the vectorization factor.  */
8391   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8392   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8393
8394   if (freq_h.nonzero_p ())
8395     {
8396       profile_probability p;
8397
8398       /* Avoid dropping loop body profile counter to 0 because of zero count
8399          in loop's preheader.  */
8400       if (!(freq_e == profile_count::zero ()))
8401         freq_e = freq_e.force_nonzero ();
8402       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8403       scale_loop_frequencies (loop, p);
8404     }
8405
8406   edge exit_e = single_exit (loop);
8407   exit_e->probability = profile_probability::always ()
8408                                  .apply_scale (1, new_est_niter + 1);
8409
8410   edge exit_l = single_pred_edge (loop->latch);
8411   profile_probability prob = exit_l->probability;
8412   exit_l->probability = exit_e->probability.invert ();
8413   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8414     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8415 }
8416
8417 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8418    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8419    stmt_vec_info.  */
8420
8421 static void
8422 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8423                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8424 {
8425   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8426   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8427
8428   if (dump_enabled_p ())
8429     dump_printf_loc (MSG_NOTE, vect_location,
8430                      "------>vectorizing statement: %G", stmt_info->stmt);
8431
8432   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8433     vect_loop_kill_debug_uses (loop, stmt_info);
8434
8435   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8436       && !STMT_VINFO_LIVE_P (stmt_info))
8437     return;
8438
8439   if (STMT_VINFO_VECTYPE (stmt_info))
8440     {
8441       poly_uint64 nunits
8442         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8443       if (!STMT_SLP_TYPE (stmt_info)
8444           && maybe_ne (nunits, vf)
8445           && dump_enabled_p ())
8446         /* For SLP VF is set according to unrolling factor, and not
8447            to vector size, hence for SLP this print is not valid.  */
8448         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8449     }
8450
8451   /* Pure SLP statements have already been vectorized.  We still need
8452      to apply loop vectorization to hybrid SLP statements.  */
8453   if (PURE_SLP_STMT (stmt_info))
8454     return;
8455
8456   if (dump_enabled_p ())
8457     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8458
8459   if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8460     *seen_store = stmt_info;
8461 }
8462
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.

   LOOP_VINFO describes the loop to transform.

   Returns the epilogue loop produced by peeling, or NULL if there is
   none or if it should not be considered for vectorization itself
   (LOOP_VINFO is already an epilogue, PARAM_VECT_EPILOGUES_NOMASK is
   zero, or no suitable vector size was found for it).  */

struct loop *
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (th >= vect_vf_for_cost (loop_vinfo)
      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Profitability threshold is %d loop iterations.\n",
                         th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.   */
  edge e = single_exit (loop);
  if (! single_pred_p (e->dest))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      poly_uint64 versioning_threshold
        = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
      /* If the two thresholds can be ordered, fold the profitability
         threshold into the versioning threshold so that no separate
         profitability check is requested from the versioning code.  */
      if (check_profitability
          && ordered_p (poly_uint64 (th), versioning_threshold))
        {
          versioning_threshold = ordered_max (poly_uint64 (th),
                                              versioning_threshold);
          check_profitability = false;
        }
      struct loop *sloop
        = vect_loop_versioning (loop_vinfo, th, check_profitability,
                                versioning_threshold);
      sloop->force_vectorize = false;
      /* Whatever was decided above, do not ask the peeling code below
         for a profitability check once the loop has been versioned.  */
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
      if (! single_pred_p (e->dest))
        {
          split_loop_exit_edge (e, true);
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
        }
    }

  /* Build the iteration count and peel the loop.  vect_do_peeling
     returns the epilogue loop, if any, and is handed NITERS_VECTOR,
     STEP_VECTOR and NITERS_VECTOR_MULT_VF by address so that it can
     fill them in.  */
  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                              &step_vector, &niters_vector_mult_vf, th,
                              check_profitability, niters_no_overflow);
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
                            LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));

  /* If peeling did not provide the vector loop's iteration count,
     compute it here: as a constant when the scalar count is known, the
     loop is not fully masked and VF is a compile-time constant, and via
     vect_gen_vector_loop_niters otherwise.  */
  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
          && known_eq (lowest_vf, vf))
        {
          niters_vector
            = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                             LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
          step_vector = build_one_cst (TREE_TYPE (niters));
        }
      else
        vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
                                     &step_vector, niters_no_overflow);
    }

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo);
    }

  /* FORNOW: the vectorizer supports only loops which body consist
     of one basic block (header + empty latch). When the vectorizer will
     support more involved loop forms, the order by which the BBs are
     traversed need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      /* First transform the PHIs of this block.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "------>vectorizing phi: %G", phi);
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (!stmt_info)
            continue;

          if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, stmt_info);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if (STMT_VINFO_VECTYPE (stmt_info)
              && (maybe_ne
                  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
              && dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

          /* Only induction, reduction and nested-cycle PHIs that are not
             pure SLP are transformed here.  */
          if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
              && ! PURE_SLP_STMT (stmt_info))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
              vect_transform_stmt (stmt_info, NULL, NULL, NULL);
            }
        }

      /* Then the ordinary statements.  The iterator is advanced inside
         the loop body because statements may be removed as we go.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si);)
        {
          stmt = gsi_stmt (si);
          /* During vectorization remove existing clobber stmts.  */
          if (gimple_clobber_p (stmt))
            {
              unlink_stmt_vdef (stmt);
              gsi_remove (&si, true);
              release_defs (stmt);
            }
          else
            {
              stmt_info = loop_vinfo->lookup_stmt (stmt);

              /* vector stmts created in the outer-loop during vectorization of
                 stmts in an inner-loop may not have a stmt_info, and do not
                 need to be vectorized.  */
              stmt_vec_info seen_store = NULL;
              if (stmt_info)
                {
                  /* For a statement that is part of a pattern, transform
                     the pattern definition sequence first, then the
                     related pattern statement, and finally the original
                     statement itself.  */
                  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
                    {
                      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
                           !gsi_end_p (subsi); gsi_next (&subsi))
                        {
                          stmt_vec_info pat_stmt_info
                            = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
                          vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
                                                    &si, &seen_store);
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
                      vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
                                                &seen_store);
                    }
                  vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
                                            &seen_store);
                }
              /* Step past the current statement before any scalar store
                 is removed below.  */
              gsi_next (&si);
              if (seen_store)
                {
                  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
                    /* Interleaving.  If IS_STORE is TRUE, the
                       vectorization of the interleaving chain was
                       completed - free all the stores in the chain.  */
                    vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
                  else
                    /* Free the attached stmt_vec_info and remove the stmt.  */
                    loop_vinfo->remove_stmt (stmt_info);
                }
            }
        }

      /* Stub out scalar statements that must not survive vectorization.
         Doing this here helps with grouped statements, or statements that
         are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
           !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
          if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
            {
              tree lhs = gimple_get_lhs (call);
              /* An IFN_MASK_LOAD whose result is not a vector is replaced
                 by an assignment of zero to its lhs.  */
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree zero = build_zero_cst (TREE_TYPE (lhs));
                  gimple *new_stmt = gimple_build_assign (lhs, zero);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
        }
    }                           /* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
                           niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
       by the vector code.  */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
         In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  The division rounds up when the final
     iteration may be partial and rounds down otherwise.  The two upper
     bounds are divided by LOWEST_VF, the estimate by ASSUMED_VF.  */
  if (loop->any_upper_bound)
    loop->nb_iterations_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
                          lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
                           lowest_vf) - 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
                          + bias_for_lowest, lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
                           + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
                          assumed_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
                           assumed_vf) - 1);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP VECTORIZED\n");
          if (loop->inner)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "OUTER LOOP VECTORIZED\n");
          dump_printf (MSG_NOTE, "\n");
        }
      else
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP EPILOGUE VECTORIZED (VS=");
          dump_dec (MSG_NOTE, current_vector_size);
          dump_printf (MSG_NOTE, ")\n");
        }
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
                         " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance, true);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear-up safelen field since its value is invalid after vectorization
     since vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  /* Don't vectorize epilogue for epilogue.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    epilogue = NULL;

  /* Likewise drop the epilogue when PARAM_VECT_EPILOGUES_NOMASK is zero.  */
  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
    epilogue = NULL;

  /* Walk the target's vector sizes looking for one to use for the
     epilogue; if the walk reaches the end of the list, the epilogue is
     dropped.  */
  if (epilogue)
    {
      auto_vector_sizes vector_sizes;
      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
      unsigned int next_size = 0;

      /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
         on niters already adjusted for the iterations of the prologue.  */
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && known_eq (vf, lowest_vf))
        {
          /* EITERS is the number of iterations left over for the
             epilogue, including the iterations peeled for gaps.  */
          unsigned HOST_WIDE_INT eiters
            = (LOOP_VINFO_INT_NITERS (loop_vinfo)
               - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
          eiters
            = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
          epilogue->nb_iterations_upper_bound = eiters - 1;
          epilogue->any_upper_bound = true;

          /* Skip sizes that do not divide the current vector size by a
             constant RATIO or for which EITERS is smaller than
             LOWEST_VF / RATIO.  */
          unsigned int ratio;
          while (next_size < vector_sizes.length ()
                 && !(constant_multiple_p (current_vector_size,
                                           vector_sizes[next_size], &ratio)
                      && eiters >= lowest_vf / ratio))
            next_size += 1;
        }
      else
        /* Skip sizes that may be larger than the current vector size.  */
        while (next_size < vector_sizes.length ()
               && maybe_lt (current_vector_size, vector_sizes[next_size]))
          next_size += 1;

      if (next_size == vector_sizes.length ())
        epilogue = NULL;
    }

  if (epilogue)
    {
      /* Let the epilogue inherit LOOP's vectorization hints and make
         sure it is not marked as not-to-be-vectorized.  */
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->safelen = loop->safelen;
      epilogue->dont_vectorize = false;

      /* We may need to if-convert epilogue to vectorize it.  */
      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
        tree_if_conversion (epilogue);
    }

  return epilogue;
}
8872
8873 /* The code below is trying to perform simple optimization - revert
8874    if-conversion for masked stores, i.e. if the mask of a store is zero
8875    do not perform it and all stored value producers also if possible.
8876    For example,
8877      for (i=0; i<n; i++)
8878        if (c[i])
8879         {
8880           p1[i] += 1;
8881           p2[i] = p3[i] +2;
8882         }
8883    this transformation will produce the following semi-hammock:
8884
8885    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8886      {
8887        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8888        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8889        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8890        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8891        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8892        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8893      }
8894 */
8895
8896 void
8897 optimize_mask_stores (struct loop *loop)
8898 {
8899   basic_block *bbs = get_loop_body (loop);
8900   unsigned nbbs = loop->num_nodes;
8901   unsigned i;
8902   basic_block bb;
8903   struct loop *bb_loop;
8904   gimple_stmt_iterator gsi;
8905   gimple *stmt;
8906   auto_vec<gimple *> worklist;
8907   auto_purge_vect_location sentinel;
8908
8909   vect_location = find_loop_location (loop);
8910   /* Pick up all masked stores in loop if any.  */
8911   for (i = 0; i < nbbs; i++)
8912     {
8913       bb = bbs[i];
8914       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8915            gsi_next (&gsi))
8916         {
8917           stmt = gsi_stmt (gsi);
8918           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8919             worklist.safe_push (stmt);
8920         }
8921     }
8922
8923   free (bbs);
8924   if (worklist.is_empty ())
8925     return;
8926
8927   /* Loop has masked stores.  */
8928   while (!worklist.is_empty ())
8929     {
8930       gimple *last, *last_store;
8931       edge e, efalse;
8932       tree mask;
8933       basic_block store_bb, join_bb;
8934       gimple_stmt_iterator gsi_to;
8935       tree vdef, new_vdef;
8936       gphi *phi;
8937       tree vectype;
8938       tree zero;
8939
8940       last = worklist.pop ();
8941       mask = gimple_call_arg (last, 2);
8942       bb = gimple_bb (last);
8943       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8944          the same loop as if_bb.  It could be different to LOOP when two
8945          level loop-nest is vectorized and mask_store belongs to the inner
8946          one.  */
8947       e = split_block (bb, last);
8948       bb_loop = bb->loop_father;
8949       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8950       join_bb = e->dest;
8951       store_bb = create_empty_bb (bb);
8952       add_bb_to_loop (store_bb, bb_loop);
8953       e->flags = EDGE_TRUE_VALUE;
8954       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8955       /* Put STORE_BB to likely part.  */
8956       efalse->probability = profile_probability::unlikely ();
8957       store_bb->count = efalse->count ();
8958       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8959       if (dom_info_available_p (CDI_DOMINATORS))
8960         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8961       if (dump_enabled_p ())
8962         dump_printf_loc (MSG_NOTE, vect_location,
8963                          "Create new block %d to sink mask stores.",
8964                          store_bb->index);
8965       /* Create vector comparison with boolean result.  */
8966       vectype = TREE_TYPE (mask);
8967       zero = build_zero_cst (vectype);
8968       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8969       gsi = gsi_last_bb (bb);
8970       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8971       /* Create new PHI node for vdef of the last masked store:
8972          .MEM_2 = VDEF <.MEM_1>
8973          will be converted to
8974          .MEM.3 = VDEF <.MEM_1>
8975          and new PHI node will be created in join bb
8976          .MEM_2 = PHI <.MEM_1, .MEM_3>
8977       */
8978       vdef = gimple_vdef (last);
8979       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8980       gimple_set_vdef (last, new_vdef);
8981       phi = create_phi_node (vdef, join_bb);
8982       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8983
8984       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8985       while (true)
8986         {
8987           gimple_stmt_iterator gsi_from;
8988           gimple *stmt1 = NULL;
8989
8990           /* Move masked store to STORE_BB.  */
8991           last_store = last;
8992           gsi = gsi_for_stmt (last);
8993           gsi_from = gsi;
8994           /* Shift GSI to the previous stmt for further traversal.  */
8995           gsi_prev (&gsi);
8996           gsi_to = gsi_start_bb (store_bb);
8997           gsi_move_before (&gsi_from, &gsi_to);
8998           /* Setup GSI_TO to the non-empty block start.  */
8999           gsi_to = gsi_start_bb (store_bb);
9000           if (dump_enabled_p ())
9001             dump_printf_loc (MSG_NOTE, vect_location,
9002                              "Move stmt to created bb\n%G", last);
9003           /* Move all stored value producers if possible.  */
9004           while (!gsi_end_p (gsi))
9005             {
9006               tree lhs;
9007               imm_use_iterator imm_iter;
9008               use_operand_p use_p;
9009               bool res;
9010
9011               /* Skip debug statements.  */
9012               if (is_gimple_debug (gsi_stmt (gsi)))
9013                 {
9014                   gsi_prev (&gsi);
9015                   continue;
9016                 }
9017               stmt1 = gsi_stmt (gsi);
9018               /* Do not consider statements writing to memory or having
9019                  volatile operand.  */
9020               if (gimple_vdef (stmt1)
9021                   || gimple_has_volatile_ops (stmt1))
9022                 break;
9023               gsi_from = gsi;
9024               gsi_prev (&gsi);
9025               lhs = gimple_get_lhs (stmt1);
9026               if (!lhs)
9027                 break;
9028
9029               /* LHS of vectorized stmt must be SSA_NAME.  */
9030               if (TREE_CODE (lhs) != SSA_NAME)
9031                 break;
9032
9033               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9034                 {
9035                   /* Remove dead scalar statement.  */
9036                   if (has_zero_uses (lhs))
9037                     {
9038                       gsi_remove (&gsi_from, true);
9039                       continue;
9040                     }
9041                 }
9042
9043               /* Check that LHS does not have uses outside of STORE_BB.  */
9044               res = true;
9045               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9046                 {
9047                   gimple *use_stmt;
9048                   use_stmt = USE_STMT (use_p);
9049                   if (is_gimple_debug (use_stmt))
9050                     continue;
9051                   if (gimple_bb (use_stmt) != store_bb)
9052                     {
9053                       res = false;
9054                       break;
9055                     }
9056                 }
9057               if (!res)
9058                 break;
9059
9060               if (gimple_vuse (stmt1)
9061                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9062                 break;
9063
9064               /* Can move STMT1 to STORE_BB.  */
9065               if (dump_enabled_p ())
9066                 dump_printf_loc (MSG_NOTE, vect_location,
9067                                  "Move stmt to created bb\n%G", stmt1);
9068               gsi_move_before (&gsi_from, &gsi_to);
9069               /* Shift GSI_TO for further insertion.  */
9070               gsi_prev (&gsi_to);
9071             }
9072           /* Put other masked stores with the same mask to STORE_BB.  */
9073           if (worklist.is_empty ()
9074               || gimple_call_arg (worklist.last (), 2) != mask
9075               || worklist.last () != stmt1)
9076             break;
9077           last = worklist.pop ();
9078         }
9079       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9080     }
9081 }
9082
9083 /* Decide whether it is possible to use a zero-based induction variable
9084    when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
9085    return the value that the induction variable must be able to hold
9086    in order to ensure that the loop ends with an all-false mask.
9087    Return -1 otherwise.  */
9088 widest_int
9089 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9090 {
9091   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9092   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9093   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9094
9095   /* Calculate the value that the induction variable must be able
9096      to hit in order to ensure that we end the loop with an all-false mask.
9097      This involves adding the maximum number of inactive trailing scalar
9098      iterations.  */
9099   widest_int iv_limit = -1;
9100   if (max_loop_iterations (loop, &iv_limit))
9101     {
9102       if (niters_skip)
9103         {
9104           /* Add the maximum number of skipped iterations to the
9105              maximum iteration count.  */
9106           if (TREE_CODE (niters_skip) == INTEGER_CST)
9107             iv_limit += wi::to_widest (niters_skip);
9108           else
9109             iv_limit += max_vf - 1;
9110         }
9111       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9112         /* Make a conservatively-correct assumption.  */
9113         iv_limit += max_vf - 1;
9114
9115       /* IV_LIMIT is the maximum number of latch iterations, which is also
9116          the maximum in-range IV value.  Round this value down to the previous
9117          vector alignment boundary and then add an extra full iteration.  */
9118       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9119       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9120     }
9121   return iv_limit;
9122 }
9123