gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
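   (For instance, the accesses a[i], b[i] and c[i] in the example above are
   consecutive in this sense; a strided access like a[2*i] or an indirect
   access like a[b[i]] would not be.)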
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
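/* For instance, the operation-support check described above boils down to
   a query of the following shape (an illustrative sketch only, reusing the
   V8HI addition from the example above):

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support, so the stmt cannot be vectorized ...  */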
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
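   For instance, with the 4-byte elements and 16-byte vectors mentioned
   above (VF = 4), each iteration of the strip-mined loop processes
   a[i..i+3], b[i..i+3] and c[i..i+3] and advances i by 4.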
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
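/* For instance (an illustrative example), the IV "i" of
   "for (i = 0; i < n; i++)" has the degree-1 evolution {0, +, 1}_loop,
   which is accepted below with *INIT = 0 and *STEP = 1, whereas an
   evolution whose step is itself a CHREC (degree >= 2) is rejected.  */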
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
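/* At the source level such a double reduction typically arises from a
   nest like the following (an illustrative sketch only):

     for (j = 0; j < M; j++)
       for (i = 0; i < N; i++)
         x += a[j][i];

   with x_1/x_2 above being the outer and inner loop-header PHIs of x.  */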
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as a reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions; this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
848 for (unsigned int i = 0; i < nbbs; i++)
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
869 /* Free all levels of MASKS. */
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
884 _loop_vec_info::~_loop_vec_info ()
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
896 gimple *stmt = gsi_stmt (si);
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
902 enum tree_code code = gimple_assign_rhs_code (stmt);
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
933 gsi_next (&si);
937 free (bbs);
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
942 loop->aux = NULL;
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
969 return cached;
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
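/* (The masks themselves are produced by the IFN_WHILE_ULT internal
   function checked below: its result is a mask in which element I is
   true iff operand 1 + I is less than operand 2, with the comparison
   performed in CMP_TYPE.)  */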
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
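  /* For instance (illustrative numbers only): for a loop that runs at
     most 1000 iterations with at most 2 scalars per iteration in an
     rgroup, max_ni is 2000 and min_ni_width is 11, since 2000 < 2^11.  */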
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1059 if (!cmp_type)
1060 return false;
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1077 /* Gather costs for statements in the scalar loop. */
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1084 for (i = 0; i < nbbs; i++)
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 if (stmt_info
1104 && !STMT_VINFO_RELEVANT_P (stmt_info)
1105 && (!STMT_VINFO_LIVE_P (stmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1108 continue;
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1118 else
1119 kind = scalar_stmt;
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1142 /* Function vect_analyze_loop_form_1.
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions. */
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1162 if (!loop->inner)
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1168 (pre-header)
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1174 (exit-bb) */
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1185 else
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1194 (pre-header)
1196 header <---+
1198 inner-loop |
1200 tail ------+
1202 (exit-bb)
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1298 return opt_result::success ();
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1322 /* We consider vectorizing this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1336 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1379 the vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop - cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1400 if (only_slp_in_loop)
1402 dump_printf_loc (MSG_NOTE, vect_location,
1403 "Loop contains only SLP stmts\n");
1404 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 else
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Loop contains SLP and non-SLP stmts\n");
1410 /* Both the vectorization factor and unroll factor have the form
1411 current_vector_size * X for some rational X, so they must have
1412 a common multiple. */
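      /* For instance (illustrative numbers only): a loop-based
         vectorization factor of 4 combined with an SLP unrolling factor
         of 2 stays at 4, while an SLP unrolling factor of 8 would raise
         it to 8.  */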
1413 vectorization_factor
1414 = force_common_multiple (vectorization_factor,
1415 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1418 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1419 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "Updating vectorization factor to ");
1423 dump_dec (MSG_NOTE, vectorization_factor);
1424 dump_printf (MSG_NOTE, ".\n");
1428 /* Return true if STMT_INFO describes a double reduction phi and if
1429 the other phi in the reduction is also relevant for vectorization.
1430 This rejects cases such as:
1432 outer1:
1433 x_1 = PHI <x_3(outer2), ...>;
1436 inner:
1437 x_2 = ...;
1440 outer2:
1441 x_3 = PHI <x_2(inner)>;
1443 if nothing in x_2 or elsewhere makes x_1 relevant. */
1445 static bool
1446 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1448 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1449 return false;
1451 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 /* Function vect_analyze_loop_operations.
1456 Scan the loop stmts and make sure they are all vectorizable. */
1458 static opt_result
1459 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1462 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1463 int nbbs = loop->num_nodes;
1464 int i;
1465 stmt_vec_info stmt_info;
1466 bool need_to_vectorize = false;
1467 bool ok;
1469 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1471 stmt_vector_for_cost cost_vec;
1472 cost_vec.create (2);
1474 for (i = 0; i < nbbs; i++)
1476 basic_block bb = bbs[i];
1478 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1479 gsi_next (&si))
1481 gphi *phi = si.phi ();
1482 ok = true;
1484 stmt_info = loop_vinfo->lookup_stmt (phi);
1485 if (dump_enabled_p ())
1486 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1487 if (virtual_operand_p (gimple_phi_result (phi)))
1488 continue;
1490 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1491 (i.e., a phi in the tail of the outer-loop). */
1492 if (! is_loop_header_bb_p (bb))
1494 /* FORNOW: we currently don't support the case that these phis
1495 are not used in the outerloop (unless it is double reduction,
1496 i.e., this phi is vect_reduction_def), because this case
1497 requires us to actually do something here. */
1498 if (STMT_VINFO_LIVE_P (stmt_info)
1499 && !vect_active_double_reduction_p (stmt_info))
1500 return opt_result::failure_at (phi,
1501 "Unsupported loop-closed phi"
1502 " in outer-loop.\n");
1504 /* If PHI is used in the outer loop, we check that its operand
1505 is defined in the inner loop. */
1506 if (STMT_VINFO_RELEVANT_P (stmt_info))
1508 tree phi_op;
1510 if (gimple_phi_num_args (phi) != 1)
1511 return opt_result::failure_at (phi, "unsupported phi");
1513 phi_op = PHI_ARG_DEF (phi, 0);
1514 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1515 if (!op_def_info)
1516 return opt_result::failure_at (phi, "unsupported phi");
1518 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1519 && (STMT_VINFO_RELEVANT (op_def_info)
1520 != vect_used_in_outer_by_reduction))
1521 return opt_result::failure_at (phi, "unsupported phi");
1524 continue;
1527 gcc_assert (stmt_info);
1529 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1530 || STMT_VINFO_LIVE_P (stmt_info))
1531 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1532 /* A scalar-dependence cycle that we don't support. */
1533 return opt_result::failure_at (phi,
1534 "not vectorized:"
1535 " scalar dependence cycle.\n");
1537 if (STMT_VINFO_RELEVANT_P (stmt_info))
1539 need_to_vectorize = true;
1540 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1541 && ! PURE_SLP_STMT (stmt_info))
1542 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1543 &cost_vec);
1544 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1545 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1546 && ! PURE_SLP_STMT (stmt_info))
1547 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1548 &cost_vec);
1551 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1552 if (ok
1553 && STMT_VINFO_LIVE_P (stmt_info)
1554 && !PURE_SLP_STMT (stmt_info))
1555 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1556 &cost_vec);
1558 if (!ok)
1559 return opt_result::failure_at (phi,
1560 "not vectorized: relevant phi not "
1561 "supported: %G",
1562 static_cast <gimple *> (phi));
1565 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566 gsi_next (&si))
1568 gimple *stmt = gsi_stmt (si);
1569 if (!gimple_clobber_p (stmt))
1571 opt_result res
1572 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1573 &need_to_vectorize,
1574 NULL, NULL, &cost_vec);
1575 if (!res)
1576 return res;
1579 } /* bbs */
1581 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1582 cost_vec.release ();
1584 /* All operations in the loop are either irrelevant (deal with loop
1585 control, or dead), or only used outside the loop and can be moved
1586 out of the loop (e.g. invariants, inductions). The loop can be
1587 optimized away by scalar optimizations. We're better off not
1588 touching this loop. */
1589 if (!need_to_vectorize)
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "All the computation can be taken out of the loop.\n");
1594 return opt_result::failure_at
1595 (vect_location,
1596 "not vectorized: redundant loop. no profit to vectorize.\n");
1599 return opt_result::success ();
1602 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1603 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1604 definitely no, or -1 if it's worth retrying. */
1606 static int
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1612 /* Only fully-masked loops can have iteration counts less than the
1613 vectorization factor. */
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1616 HOST_WIDE_INT max_niter;
1618 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1619 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1620 else
1621 max_niter = max_stmt_executions_int (loop);
1623 if (max_niter != -1
1624 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "not vectorized: iteration count smaller than "
1629 "vectorization factor.\n");
1630 return 0;
1634 int min_profitable_iters, min_profitable_estimate;
1635 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1636 &min_profitable_estimate);
1638 if (min_profitable_iters < 0)
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: vectorization not profitable.\n");
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "not vectorized: vector version will never be "
1646 "profitable.\n");
1647 return -1;
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1651 * assumed_vf);
1653 /* Use the cost model only if it is more conservative than the user-specified
1654 threshold. */
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1656 min_profitable_iters);
1658 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
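  /* For instance (illustrative numbers only): with
     --param min-vect-loop-bound=2 and an assumed VF of 4,
     min_scalar_loop_bound is 8; if the cost model computed a
     min_profitable_iters of 12, TH is 12 and a loop known to run only
     10 iterations is rejected below.  */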
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1661 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 "not vectorized: vectorization not profitable.\n");
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location,
1668 "not vectorized: iteration count smaller than user "
1669 "specified loop bound parameter or minimum profitable "
1670 "iterations (whichever is more conservative).\n");
1671 return 0;
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1675 if (estimated_niter == -1)
1676 estimated_niter = likely_max_stmt_executions_int (loop);
1677 if (estimated_niter != -1
1678 && ((unsigned HOST_WIDE_INT) estimated_niter
1679 < MAX (th, (unsigned) min_profitable_estimate)))
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: estimated iteration count too "
1684 "small.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "not vectorized: estimated iteration count smaller "
1688 "than specified loop bound parameter or minimum "
1689 "profitable iterations (whichever is more "
1690 "conservative).\n");
1691 return -1;
1694 return 1;
1697 static opt_result
1698 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1699 vec<data_reference_p> *datarefs,
1700 unsigned int *n_stmts)
1702 *n_stmts = 0;
1703 for (unsigned i = 0; i < loop->num_nodes; i++)
1704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1705 !gsi_end_p (gsi); gsi_next (&gsi))
1707 gimple *stmt = gsi_stmt (gsi);
1708 if (is_gimple_debug (stmt))
1709 continue;
1710 ++(*n_stmts);
1711 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1712 if (!res)
1714 if (is_gimple_call (stmt) && loop->safelen)
1716 tree fndecl = gimple_call_fndecl (stmt), op;
1717 if (fndecl != NULL_TREE)
1719 cgraph_node *node = cgraph_node::get (fndecl);
1720 if (node != NULL && node->simd_clones != NULL)
1722 unsigned int j, n = gimple_call_num_args (stmt);
1723 for (j = 0; j < n; j++)
1725 op = gimple_call_arg (stmt, j);
1726 if (DECL_P (op)
1727 || (REFERENCE_CLASS_P (op)
1728 && get_base_address (op)))
1729 break;
1731 op = gimple_call_lhs (stmt);
1732 /* Ignore #pragma omp declare simd functions
1733 if they don't have data references in the
1734 call stmt itself. */
1735 if (j == n
1736 && !(op
1737 && (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))))
1740 continue;
1744 return res;
1746 /* If dependence analysis will give up due to the limit on the
1747 number of datarefs stop here and fail fatally. */
1748 if (datarefs->length ()
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1750 return opt_result::failure_at (stmt, "exceeded param "
1751 "loop-max-datarefs-for-datadeps\n");
1753 return opt_result::success ();
1756 /* Function vect_analyze_loop_2.
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1759 for it. The different analyses will record information in the
1760 loop_vec_info struct. */
1761 static opt_result
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1764 opt_result ok = opt_result::success ();
1765 int res;
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1767 poly_uint64 min_vf = 2;
1769 /* The first group of checks is independent of the vector size. */
1770 fatal = true;
1772 /* Find all data references in the loop (which correspond to vdefs/vuses)
1773 and analyze their evolution in the loop. */
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1777 /* Gather the data references and count stmts in the loop. */
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1780 opt_result res
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1782 &LOOP_VINFO_DATAREFS (loop_vinfo),
1783 n_stmts);
1784 if (!res)
1786 if (dump_enabled_p ())
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1788 "not vectorized: loop contains function "
1789 "calls or data references that cannot "
1790 "be analyzed\n");
1791 return res;
1793 loop_vinfo->shared->save_datarefs ();
1795 else
1796 loop_vinfo->shared->check_datarefs ();
1798 /* Analyze the data references and also adjust the minimal
1799 vectorization factor according to the loads and stores. */
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1802 if (!ok)
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data references.\n");
1807 return ok;
1810 /* Classify all cross-iteration scalar data-flow cycles.
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1812 vect_analyze_scalar_cycles (loop_vinfo);
1814 vect_pattern_recog (loop_vinfo);
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1822 if (!ok)
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data access.\n");
1827 return ok;
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1833 if (!ok)
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "unexpected pattern.\n");
1838 return ok;
1841 /* The rest of the analysis below depends on the vector size in some way,
so failures from here on are not necessarily fatal. */
1842 fatal = false;
1844 /* Analyze data dependences between the data-refs in the loop
1845 and adjust the maximum vectorization factor according to
1846 the dependences.
1847 FORNOW: fail at the first data dependence that we encounter. */
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1850 if (!ok)
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data dependence.\n");
1855 return ok;
1857 if (max_vf != MAX_VECTORIZATION_FACTOR
1858 && maybe_lt (max_vf, min_vf))
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1862 ok = vect_determine_vectorization_factor (loop_vinfo);
1863 if (!ok)
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "can't determine vectorization factor.\n");
1868 return ok;
1870 if (max_vf != MAX_VECTORIZATION_FACTOR
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1874 /* Compute the scalar iteration cost. */
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1878 unsigned th;
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1882 if (!ok)
1883 return ok;
1885 /* If there are any SLP instances mark them as pure_slp. */
1886 bool slp = vect_make_slp_decision (loop_vinfo);
1887 if (slp)
1889 /* Find stmts that need to be both vectorized and SLPed. */
1890 vect_detect_hybrid_slp (loop_vinfo);
1892 /* Update the vectorization factor based on the SLP decision. */
1893 vect_update_vf_for_slp (loop_vinfo);
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1898 /* We don't expect to have to roll back to anything other than an empty
1899 set of rgroups. */
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1902 /* This is the point where we can re-start analysis with SLP forced off. */
1903 start_over:
1905 /* Now the vectorization factor is final. */
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907 gcc_assert (known_ne (vectorization_factor, 0U));
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "vectorization_factor = ");
1913 dump_dec (MSG_NOTE, vectorization_factor);
1914 dump_printf (MSG_NOTE, ", niters = %wd\n",
1915 LOOP_VINFO_INT_NITERS (loop_vinfo));
1918 HOST_WIDE_INT max_niter
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1921 /* Analyze the alignment of the data-refs in the loop.
1922 Fail if a data reference is found that cannot be vectorized. */
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1925 if (!ok)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data alignment.\n");
1930 return ok;
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1934 It is important to call pruning after vect_analyze_data_ref_accesses,
1935 since we use grouping information gathered by interleaving analysis. */
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1937 if (!ok)
1938 return ok;
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1941 vectorization, since we do not want to add extra peeling or
1942 versioning for alignment. */
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 /* This pass will decide on using loop versioning and/or loop peeling in
1945 order to enhance the alignment of data references in the loop. */
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1947 else
1948 ok = vect_verify_datarefs_alignment (loop_vinfo);
1949 if (!ok)
1950 return ok;
1952 if (slp)
1954 /* Analyze operations in the SLP instances. Note this may
1955 remove unsupported SLP instances which makes the above
1956 SLP kind detection invalid. */
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1958 vect_slp_analyze_operations (loop_vinfo);
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1961 ok = opt_result::failure_at (vect_location,
1962 "unsupported SLP instances\n");
1963 goto again;
1967 /* Scan all the remaining operations in the loop that are not subject
1968 to SLP and make sure they are vectorizable. */
1969 ok = vect_analyze_loop_operations (loop_vinfo);
1970 if (!ok)
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad operation or unsupported loop bound.\n");
1975 return ok;
1978 /* Decide whether to use a fully-masked loop for this vectorization
1979 factor. */
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1982 && vect_verify_full_masking (loop_vinfo));
1983 if (dump_enabled_p ())
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "using a fully-masked loop.\n");
1988 else
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "not using a fully-masked loop.\n");
1993 /* If an epilogue loop is required because of data accesses with gaps,
1994 one additional iteration needs to be peeled. Check if there are
1995 enough iterations for vectorization. */
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2003 if (known_lt (wi::to_widest (scalar_niters), vf))
2004 return opt_result::failure_at (vect_location,
2005 "loop has no enough iterations to"
2006 " support peeling for gaps.\n");
2009 /* Check that the costings of the loop make vectorizing worthwhile. */
2010 res = vect_analyze_loop_costing (loop_vinfo);
2011 if (res < 0)
2013 ok = opt_result::failure_at (vect_location,
2014 "Loop costings may not be worthwhile.\n");
2015 goto again;
2017 if (!res)
2018 return opt_result::failure_at (vect_location,
2019 "Loop costings not worthwhile.\n");
2021 /* Decide whether we need to create an epilogue loop to handle
2022 remaining scalar iterations. */
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2025 unsigned HOST_WIDE_INT const_vf;
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 /* The main loop handles all iterations. */
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2032 /* Work out the (constant) number of iterations that need to be
2033 peeled for reasons other than niters. */
2034 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2035 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2036 peel_niter += 1;
2037 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2038 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2039 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2041 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2042 /* ??? When peeling for gaps but not alignment, we could
2043 try to check whether the (variable) niters is known to be
2044 VF * N + 1. That's something of a niche case though. */
2045 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2046 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2047 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2048 < (unsigned) exact_log2 (const_vf))
2049 /* In case of versioning, check if the maximum number of
2050 iterations is greater than th. If they are identical,
2051 the epilogue is unnecessary. */
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2053 || ((unsigned HOST_WIDE_INT) max_niter
2054 > (th / const_vf) * const_vf))))
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2057 /* If an epilogue loop is required make sure we can create one. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2063 if (!vect_can_advance_ivs_p (loop_vinfo)
2064 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2065 single_exit (LOOP_VINFO_LOOP
2066 (loop_vinfo))))
2068 ok = opt_result::failure_at (vect_location,
2069 "not vectorized: can't create required "
2070 "epilog loop\n");
2071 goto again;
2075 /* During peeling, we need to check if the number of loop iterations is
2076 enough for both the peeled prolog loop and the vector loop. This check
2077 can be merged with the threshold check of loop versioning, so
2078 increase the threshold for this case if necessary. */
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2081 poly_uint64 niters_th = 0;
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2085 /* Niters for peeled prolog loop. */
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2088 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2089 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2090 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2092 else
2093 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2096 /* Niters for at least one iteration of vectorized loop. */
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2099 /* One additional iteration because of peeling for gaps. */
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 niters_th += 1;
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2105 gcc_assert (known_eq (vectorization_factor,
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2108 /* Ok to vectorize! */
2109 return opt_result::success ();
2111 again:
2112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2113 gcc_assert (!ok);
2115 /* Try again with SLP forced off but if we didn't do any SLP there is
2116 no point in re-trying. */
2117 if (!slp)
2118 return ok;
2120 /* If there are reduction chains re-trying will fail anyway. */
2121 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2122 return ok;
2124 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2125 via interleaving or lane instructions. */
2126 slp_instance instance;
2127 slp_tree node;
2128 unsigned i, j;
2129 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2131 stmt_vec_info vinfo;
2132 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2133 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2134 continue;
2135 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2136 unsigned int size = DR_GROUP_SIZE (vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2138 if (! vect_store_lanes_supported (vectype, size, false)
2139 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2140 && ! vect_grouped_store_supported (vectype, size))
2141 return opt_result::failure_at (vinfo->stmt,
2142 "unsupported grouped store\n");
2143 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2145 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2146 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2147 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2148 size = DR_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size, false)
2151 && ! vect_grouped_load_supported (vectype, single_element_p,
2152 size))
2153 return opt_result::failure_at (vinfo->stmt,
2154 "unsupported grouped load\n");
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "re-trying with SLP disabled\n");
2162 /* Roll back state appropriately. No SLP this time. */
2163 slp = false;
2164 /* Restore the vectorization factor as it was without SLP. */
2165 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2166 /* Free the SLP instances. */
2167 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2168 vect_free_slp_instance (instance, false);
2169 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2170 /* Reset SLP type to loop_vect on all stmts. */
2171 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2173 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2175 !gsi_end_p (si); gsi_next (&si))
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2178 STMT_SLP_TYPE (stmt_info) = loop_vect;
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2181 !gsi_end_p (si); gsi_next (&si))
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2184 STMT_SLP_TYPE (stmt_info) = loop_vect;
2185 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2187 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2188 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2189 STMT_SLP_TYPE (stmt_info) = loop_vect;
2190 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2191 !gsi_end_p (pi); gsi_next (&pi))
2192 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2193 = loop_vect;
2197 /* Free optimized alias test DDRS. */
2198 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2200 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2201 /* Reset target cost data. */
2202 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2203 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2204 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2205 /* Reset accumulated rgroup information. */
2206 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2207 /* Reset assorted flags. */
2208 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2209 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2210 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2211 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2214 goto start_over;
2217 /* Function vect_analyze_loop.
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2220 for it. The different analyses will record information in the
2221 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, an epilogue must
2222 be vectorized. */
2223 opt_loop_vec_info
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2225 vec_info_shared *shared)
2227 auto_vector_sizes vector_sizes;
2229 /* Autodetect first vector size we try. */
2230 current_vector_size = 0;
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2232 unsigned int next_size = 0;
2234 DUMP_VECT_SCOPE ("analyze_loop_nest");
2236 if (loop_outer (loop)
2237 && loop_vec_info_for_loop (loop_outer (loop))
2238 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2239 return opt_loop_vec_info::failure_at (vect_location,
2240 "outer-loop already vectorized.\n");
2242 if (!find_loop_nest (loop, &shared->loop_nest))
2243 return opt_loop_vec_info::failure_at
2244 (vect_location,
2245 "not vectorized: loop nest containing two or more consecutive inner"
2246 " loops cannot be vectorized\n");
2248 unsigned n_stmts = 0;
2249 poly_uint64 autodetected_vector_size = 0;
2250 while (1)
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2253 opt_loop_vec_info loop_vinfo
2254 = vect_analyze_loop_form (loop, shared);
2255 if (!loop_vinfo)
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad loop form.\n");
2260 return loop_vinfo;
2263 bool fatal = false;
2265 if (orig_loop_vinfo)
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2269 if (res)
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2273 return loop_vinfo;
2276 delete loop_vinfo;
2278 if (next_size == 0)
2279 autodetected_vector_size = current_vector_size;
2281 if (next_size < vector_sizes.length ()
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2283 next_size += 1;
2285 if (fatal
2286 || next_size == vector_sizes.length ()
2287 || known_eq (current_vector_size, 0U))
2288 return opt_loop_vec_info::propagate_failure (res);
2290 /* Try the next biggest vector size. */
2291 current_vector_size = vector_sizes[next_size++];
2292 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "***** Re-trying analysis with "
2296 "vector size ");
2297 dump_dec (MSG_NOTE, current_vector_size);
2298 dump_printf (MSG_NOTE, "\n");
2303 /* Return true if there is an in-order reduction function for CODE, storing
2304 it in *REDUC_FN if so. */
2306 static bool
2307 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2309 switch (code)
2311 case PLUS_EXPR:
2312 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2313 return true;
2315 default:
2316 return false;
2320 /* Function reduction_fn_for_scalar_code
2322 Input:
2323 CODE - tree_code of a reduction operation.
2325 Output:
2326 REDUC_FN - the corresponding internal function to be used to reduce the
2327 vector of partial results into a single scalar result, or IFN_LAST
2328 if the operation is a supported reduction operation, but does not have
2329 such an internal function.
2331 Return FALSE if CODE currently cannot be vectorized as a reduction. */
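/* Illustrative usage sketch (added for exposition, an assumption rather than
   part of the original sources): a caller would typically do

     internal_fn reduc_fn;
     if (reduction_fn_for_scalar_code (MAX_EXPR, &reduc_fn))
       use reduc_fn, here IFN_REDUC_MAX; a value of IFN_LAST means the
       reduction is supported but has no single-call internal function.  */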
2333 static bool
2334 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2336 switch (code)
2338 case MAX_EXPR:
2339 *reduc_fn = IFN_REDUC_MAX;
2340 return true;
2342 case MIN_EXPR:
2343 *reduc_fn = IFN_REDUC_MIN;
2344 return true;
2346 case PLUS_EXPR:
2347 *reduc_fn = IFN_REDUC_PLUS;
2348 return true;
2350 case BIT_AND_EXPR:
2351 *reduc_fn = IFN_REDUC_AND;
2352 return true;
2354 case BIT_IOR_EXPR:
2355 *reduc_fn = IFN_REDUC_IOR;
2356 return true;
2358 case BIT_XOR_EXPR:
2359 *reduc_fn = IFN_REDUC_XOR;
2360 return true;
2362 case MULT_EXPR:
2363 case MINUS_EXPR:
2364 *reduc_fn = IFN_LAST;
2365 return true;
2367 default:
2368 return false;
2372 /* If there is a neutral value X such that SLP reduction NODE would not
2373 be affected by the introduction of additional X elements, return that X,
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2375 is true if the SLP statements perform a single reduction, false if each
2376 statement performs an independent reduction. */
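/* For example (illustrative): padding a PLUS_EXPR SLP reduction group
   { a0, a1, a2 } with the neutral value 0 up to the vector width leaves the
   final sum unchanged (a0 + a1 + a2 + 0 == a0 + a1 + a2); likewise 1 is
   neutral for MULT_EXPR and an all-ones constant for BIT_AND_EXPR.  */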
2378 static tree
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2380 bool reduc_chain)
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2383 stmt_vec_info stmt_vinfo = stmts[0];
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2385 tree scalar_type = TREE_TYPE (vector_type);
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2387 gcc_assert (loop);
2389 switch (code)
2391 case WIDEN_SUM_EXPR:
2392 case DOT_PROD_EXPR:
2393 case SAD_EXPR:
2394 case PLUS_EXPR:
2395 case MINUS_EXPR:
2396 case BIT_IOR_EXPR:
2397 case BIT_XOR_EXPR:
2398 return build_zero_cst (scalar_type);
2400 case MULT_EXPR:
2401 return build_one_cst (scalar_type);
2403 case BIT_AND_EXPR:
2404 return build_all_ones_cst (scalar_type);
2406 case MAX_EXPR:
2407 case MIN_EXPR:
2408 /* For MIN/MAX the initial values are neutral. A reduction chain
2409 has only a single initial value, so that value is neutral for
2410 all statements. */
2411 if (reduc_chain)
2412 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2413 loop_preheader_edge (loop));
2414 return NULL_TREE;
2416 default:
2417 return NULL_TREE;
2421 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2422 STMT is printed with a message MSG. */
2424 static void
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2431 operation. Return true if the results of DEF_STMT_INFO are something
2432 that can be accumulated by such a reduction. */
2434 static bool
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2437 return (is_gimple_assign (def_stmt_info->stmt)
2438 || is_gimple_call (def_stmt_info->stmt)
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2445 /* Detect SLP reduction of the form:
2447 #a1 = phi <a5, a0>
2448 a2 = operation (a1)
2449 a3 = operation (a2)
2450 a4 = operation (a3)
2451 a5 = operation (a4)
2453 #a = phi <a5>
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2456 FIRST_STMT is the first reduction stmt in the chain
2457 (a2 = operation (a1)).
2459 Return TRUE if a reduction chain was detected. */
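/* For illustration (an assumed source form, not taken from the testsuite),
   such a chain typically comes from a loop that accumulates several elements
   per iteration with the same operation, e.g.

     for (i = 0; i < n; i++)
       {
         s = s + a[4*i];
         s = s + a[4*i + 1];
         s = s + a[4*i + 2];
         s = s + a[4*i + 3];
       }

   which yields four chained reduction statements per iteration.  */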
2461 static bool
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2463 gimple *first_stmt)
2465 struct loop *loop = (gimple_bb (phi))->loop_father;
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2467 enum tree_code code;
2468 gimple *loop_use_stmt = NULL;
2469 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2470 tree lhs;
2471 imm_use_iterator imm_iter;
2472 use_operand_p use_p;
2473 int nloop_uses, size = 0, n_out_of_loop_uses;
2474 bool found = false;
2476 if (loop != vect_loop)
2477 return false;
2479 lhs = PHI_RESULT (phi);
2480 code = gimple_assign_rhs_code (first_stmt);
2481 while (1)
2483 nloop_uses = 0;
2484 n_out_of_loop_uses = 0;
2485 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2487 gimple *use_stmt = USE_STMT (use_p);
2488 if (is_gimple_debug (use_stmt))
2489 continue;
2491 /* Check if we got back to the reduction phi. */
2492 if (use_stmt == phi)
2494 loop_use_stmt = use_stmt;
2495 found = true;
2496 break;
2499 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2501 loop_use_stmt = use_stmt;
2502 nloop_uses++;
2504 else
2505 n_out_of_loop_uses++;
2507 /* There can be either a single use in the loop or two uses in
2508 phi nodes. */
2509 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2510 return false;
2513 if (found)
2514 break;
2516 /* We reached a statement with no loop uses. */
2517 if (nloop_uses == 0)
2518 return false;
2520 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2521 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2522 return false;
2524 if (!is_gimple_assign (loop_use_stmt)
2525 || code != gimple_assign_rhs_code (loop_use_stmt)
2526 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2527 return false;
2529 /* Insert USE_STMT into reduction chain. */
2530 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2531 if (current_stmt_info)
2533 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2534 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2535 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2537 else
2538 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2540 lhs = gimple_assign_lhs (loop_use_stmt);
2541 current_stmt_info = use_stmt_info;
2542 size++;
2545 if (!found || loop_use_stmt != phi || size < 2)
2546 return false;
2548 /* Swap the operands, if needed, to make the reduction operand be the second
2549 operand. */
2550 lhs = PHI_RESULT (phi);
2551 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2552 while (next_stmt_info)
2554 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2555 if (gimple_assign_rhs2 (next_stmt) == lhs)
2557 tree op = gimple_assign_rhs1 (next_stmt);
2558 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2560 /* Check that the other def is either defined in the loop
2561 ("vect_internal_def"), or it's an induction (defined by a
2562 loop-header phi-node). */
2563 if (def_stmt_info
2564 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2565 && vect_valid_reduction_input_p (def_stmt_info))
2567 lhs = gimple_assign_lhs (next_stmt);
2568 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2569 continue;
2572 return false;
2574 else
2576 tree op = gimple_assign_rhs2 (next_stmt);
2577 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2579 /* Check that the other def is either defined in the loop
2580 ("vect_internal_def"), or it's an induction (defined by a
2581 loop-header phi-node). */
2582 if (def_stmt_info
2583 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2584 && vect_valid_reduction_input_p (def_stmt_info))
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2588 next_stmt);
2590 swap_ssa_operands (next_stmt,
2591 gimple_assign_rhs1_ptr (next_stmt),
2592 gimple_assign_rhs2_ptr (next_stmt));
2593 update_stmt (next_stmt);
2595 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2596 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2598 else
2599 return false;
2602 lhs = gimple_assign_lhs (next_stmt);
2603 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2606 /* Save the chain for further analysis in SLP detection. */
2607 stmt_vec_info first_stmt_info
2608 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2609 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2610 REDUC_GROUP_SIZE (first_stmt_info) = size;
2612 return true;
2615 /* Return true if we need an in-order reduction for operation CODE
2616 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2617 overflow must wrap. */
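/* For example (illustrative): a double-precision accumulation such as

     double s = 0.0;
     for (i = 0; i < n; i++)
       s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math is in
   effect, because reassociating the additions into vector lanes can change
   the rounding of the final result.  */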
2619 static bool
2620 needs_fold_left_reduction_p (tree type, tree_code code,
2621 bool need_wrapping_integral_overflow)
2623 /* CHECKME: check for !flag_finite_math_only too? */
2624 if (SCALAR_FLOAT_TYPE_P (type))
2625 switch (code)
2627 case MIN_EXPR:
2628 case MAX_EXPR:
2629 return false;
2631 default:
2632 return !flag_associative_math;
2635 if (INTEGRAL_TYPE_P (type))
2637 if (!operation_no_trapping_overflow (type, code))
2638 return true;
2639 if (need_wrapping_integral_overflow
2640 && !TYPE_OVERFLOW_WRAPS (type)
2641 && operation_can_overflow (code))
2642 return true;
2643 return false;
2646 if (SAT_FIXED_POINT_TYPE_P (type))
2647 return true;
2649 return false;
2652 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2653 reduction operation CODE has a handled computation expression. */
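/* For illustration (hypothetical SSA names): a handled multi-statement
   reduction path for code == PLUS_EXPR looks like

     x_1 = PHI <x_0 (preheader), x_4 (latch)>
     x_2 = x_1 + a[i];
     x_3 = x_2 + b[i];
     x_4 = x_3 + c[i];

   where every statement on the path from the latch definition x_4 back to
   the PHI result x_1 uses the same code and has a single use.  */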
2655 bool
2656 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2657 tree loop_arg, enum tree_code code)
2659 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2660 auto_bitmap visited;
2661 tree lookfor = PHI_RESULT (phi);
2662 ssa_op_iter curri;
2663 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2664 while (USE_FROM_PTR (curr) != loop_arg)
2665 curr = op_iter_next_use (&curri);
2666 curri.i = curri.numops;
2669 path.safe_push (std::make_pair (curri, curr));
2670 tree use = USE_FROM_PTR (curr);
2671 if (use == lookfor)
2672 break;
2673 gimple *def = SSA_NAME_DEF_STMT (use);
2674 if (gimple_nop_p (def)
2675 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2677 pop:
2680 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2681 curri = x.first;
2682 curr = x.second;
2684 curr = op_iter_next_use (&curri);
2685 /* Skip already visited or non-SSA operands (from iterating
2686 over PHI args). */
2687 while (curr != NULL_USE_OPERAND_P
2688 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2689 || ! bitmap_set_bit (visited,
2690 SSA_NAME_VERSION
2691 (USE_FROM_PTR (curr)))));
2693 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2694 if (curr == NULL_USE_OPERAND_P)
2695 break;
2697 else
2699 if (gimple_code (def) == GIMPLE_PHI)
2700 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2701 else
2702 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2703 while (curr != NULL_USE_OPERAND_P
2704 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2705 || ! bitmap_set_bit (visited,
2706 SSA_NAME_VERSION
2707 (USE_FROM_PTR (curr)))))
2708 curr = op_iter_next_use (&curri);
2709 if (curr == NULL_USE_OPERAND_P)
2710 goto pop;
2713 while (1);
2714 if (dump_file && (dump_flags & TDF_DETAILS))
2716 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2717 unsigned i;
2718 std::pair<ssa_op_iter, use_operand_p> *x;
2719 FOR_EACH_VEC_ELT (path, i, x)
2720 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2721 dump_printf (MSG_NOTE, "\n");
2724 /* Check whether the reduction path detected is valid. */
2725 bool fail = path.length () == 0;
2726 bool neg = false;
2727 for (unsigned i = 1; i < path.length (); ++i)
2729 gimple *use_stmt = USE_STMT (path[i].second);
2730 tree op = USE_FROM_PTR (path[i].second);
2731 if (! has_single_use (op)
2732 || ! is_gimple_assign (use_stmt))
2734 fail = true;
2735 break;
2737 if (gimple_assign_rhs_code (use_stmt) != code)
2739 if (code == PLUS_EXPR
2740 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2742 /* Track whether we negate the reduction value each iteration. */
2743 if (gimple_assign_rhs2 (use_stmt) == op)
2744 neg = ! neg;
2746 else
2748 fail = true;
2749 break;
2753 return ! fail && ! neg;
2757 /* Function vect_is_simple_reduction
2759 (1) Detect a cross-iteration def-use cycle that represents a simple
2760 reduction computation. We look for the following pattern:
2762 loop_header:
2763 a1 = phi < a0, a2 >
2764 a3 = ...
2765 a2 = operation (a3, a1)
2769 a3 = ...
2770 loop_header:
2771 a1 = phi < a0, a2 >
2772 a2 = operation (a3, a1)
2774 such that:
2775 1. operation is commutative and associative and it is safe to
2776 change the order of the computation
2777 2. no uses of a2 in the loop (a2 is used outside the loop)
2778 3. no uses of a1 in the loop besides the reduction operation
2779 4. no uses of a1 outside the loop.
2781 Conditions 1,4 are tested here.
2782 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2784 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2785 nested cycles.
2787 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2788 reductions:
2790 a1 = phi < a0, a2 >
2791 inner loop (def of a3)
2792 a2 = phi < a3 >
2794 (4) Detect condition expressions, i.e.:
2795 for (int i = 0; i < N; i++)
2796 if (a[i] < val)
2797 ret_val = a[i];
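   For illustration (hypothetical gimple form added for exposition), the
   COND_REDUCTION case above roughly corresponds to

     ret_val_1 = PHI <ret_val_0 (preheader), ret_val_2 (latch)>
     _1 = a[i];
     _2 = _1 < val;
     ret_val_2 = _2 ? _1 : ret_val_1;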
2801 static stmt_vec_info
2802 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2803 bool *double_reduc,
2804 bool need_wrapping_integral_overflow,
2805 enum vect_reduction_type *v_reduc_type)
2807 gphi *phi = as_a <gphi *> (phi_info->stmt);
2808 struct loop *loop = (gimple_bb (phi))->loop_father;
2809 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2810 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2811 gimple *phi_use_stmt = NULL;
2812 enum tree_code orig_code, code;
2813 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2814 tree type;
2815 tree name;
2816 imm_use_iterator imm_iter;
2817 use_operand_p use_p;
2818 bool phi_def;
2820 *double_reduc = false;
2821 *v_reduc_type = TREE_CODE_REDUCTION;
2823 tree phi_name = PHI_RESULT (phi);
2824 /* ??? If there are no uses of the PHI result the inner loop reduction
2825 won't be detected as a possible double reduction by vectorizable_reduction,
2826 because that tries to walk the PHI arg from the preheader edge, which
2827 can be constant. See PR60382. */
2828 if (has_zero_uses (phi_name))
2829 return NULL;
2830 unsigned nphi_def_loop_uses = 0;
2831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2833 gimple *use_stmt = USE_STMT (use_p);
2834 if (is_gimple_debug (use_stmt))
2835 continue;
2837 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2839 if (dump_enabled_p ())
2840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2841 "intermediate value used outside loop.\n");
2843 return NULL;
2846 nphi_def_loop_uses++;
2847 phi_use_stmt = use_stmt;
2850 edge latch_e = loop_latch_edge (loop);
2851 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2852 if (TREE_CODE (loop_arg) != SSA_NAME)
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2856 "reduction: not ssa_name: %T\n", loop_arg);
2857 return NULL;
2860 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2861 if (!def_stmt_info
2862 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2863 return NULL;
2865 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2867 name = gimple_assign_lhs (def_stmt);
2868 phi_def = false;
2870 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2872 name = PHI_RESULT (def_stmt);
2873 phi_def = true;
2875 else
2877 if (dump_enabled_p ())
2878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2879 "reduction: unhandled reduction operation: %G",
2880 def_stmt_info->stmt);
2881 return NULL;
2884 unsigned nlatch_def_loop_uses = 0;
2885 auto_vec<gphi *, 3> lcphis;
2886 bool inner_loop_of_double_reduc = false;
2887 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2889 gimple *use_stmt = USE_STMT (use_p);
2890 if (is_gimple_debug (use_stmt))
2891 continue;
2892 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2893 nlatch_def_loop_uses++;
2894 else
2896 /* We can have more than one loop-closed PHI. */
2897 lcphis.safe_push (as_a <gphi *> (use_stmt));
2898 if (nested_in_vect_loop
2899 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2900 == vect_double_reduction_def))
2901 inner_loop_of_double_reduc = true;
2905 /* If this isn't a nested cycle, or if the nested cycle reduction value
2906 is used outside of the inner loop, we cannot handle uses of the reduction
2907 value. */
2908 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2909 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2911 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2913 "reduction used in loop.\n");
2914 return NULL;
2917 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2918 defined in the inner loop. */
2919 if (phi_def)
2921 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2922 op1 = PHI_ARG_DEF (def_stmt, 0);
2924 if (gimple_phi_num_args (def_stmt) != 1
2925 || TREE_CODE (op1) != SSA_NAME)
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2929 "unsupported phi node definition.\n");
2931 return NULL;
2934 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2935 if (gimple_bb (def1)
2936 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2937 && loop->inner
2938 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2939 && is_gimple_assign (def1)
2940 && is_a <gphi *> (phi_use_stmt)
2941 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2943 if (dump_enabled_p ())
2944 report_vect_op (MSG_NOTE, def_stmt,
2945 "detected double reduction: ");
2947 *double_reduc = true;
2948 return def_stmt_info;
2951 return NULL;
2954 /* If we are vectorizing an inner reduction, we execute it
2955 in the original order only if we are not dealing with a
2956 double reduction. */
2957 bool check_reduction = true;
2958 if (flow_loop_nested_p (vect_loop, loop))
2960 gphi *lcphi;
2961 unsigned i;
2962 check_reduction = false;
2963 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2964 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2966 gimple *use_stmt = USE_STMT (use_p);
2967 if (is_gimple_debug (use_stmt))
2968 continue;
2969 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2970 check_reduction = true;
2974 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2975 code = orig_code = gimple_assign_rhs_code (def_stmt);
2977 if (nested_in_vect_loop && !check_reduction)
2979 /* FIXME: Even for non-reductions code generation is funneled
2980 through vectorizable_reduction for the stmt defining the
2981 PHI latch value. So we have to artificially restrict ourselves
2982 for the supported operations. */
2983 switch (get_gimple_rhs_class (code))
2985 case GIMPLE_BINARY_RHS:
2986 case GIMPLE_TERNARY_RHS:
2987 break;
2988 default:
2989 /* Not supported by vectorizable_reduction. */
2990 if (dump_enabled_p ())
2991 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2992 "nested cycle: not handled operation: ");
2993 return NULL;
2995 if (dump_enabled_p ())
2996 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2997 return def_stmt_info;
3000 /* We can handle "res -= x[i]", which is non-associative, by
3001 simply rewriting this into "res += -x[i]". Avoid changing the
3002 gimple instruction for the first simple tests and only do this
3003 if we're allowed to change code at all. */
3004 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3005 code = PLUS_EXPR;
3007 if (code == COND_EXPR)
3009 if (! nested_in_vect_loop)
3010 *v_reduc_type = COND_REDUCTION;
3012 op3 = gimple_assign_rhs1 (def_stmt);
3013 if (COMPARISON_CLASS_P (op3))
3015 op4 = TREE_OPERAND (op3, 1);
3016 op3 = TREE_OPERAND (op3, 0);
3018 if (op3 == phi_name || op4 == phi_name)
3020 if (dump_enabled_p ())
3021 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3022 "reduction: condition depends on previous"
3023 " iteration: ");
3024 return NULL;
3027 op1 = gimple_assign_rhs2 (def_stmt);
3028 op2 = gimple_assign_rhs3 (def_stmt);
3030 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3032 if (dump_enabled_p ())
3033 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3034 "reduction: not commutative/associative: ");
3035 return NULL;
3037 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3039 op1 = gimple_assign_rhs1 (def_stmt);
3040 op2 = gimple_assign_rhs2 (def_stmt);
3042 else
3044 if (dump_enabled_p ())
3045 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3046 "reduction: not handled operation: ");
3047 return NULL;
3050 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3052 if (dump_enabled_p ())
3053 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3054 "reduction: both uses not ssa_names: ");
3056 return NULL;
3059 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3060 if ((TREE_CODE (op1) == SSA_NAME
3061 && !types_compatible_p (type,TREE_TYPE (op1)))
3062 || (TREE_CODE (op2) == SSA_NAME
3063 && !types_compatible_p (type, TREE_TYPE (op2)))
3064 || (op3 && TREE_CODE (op3) == SSA_NAME
3065 && !types_compatible_p (type, TREE_TYPE (op3)))
3066 || (op4 && TREE_CODE (op4) == SSA_NAME
3067 && !types_compatible_p (type, TREE_TYPE (op4))))
3069 if (dump_enabled_p ())
3071 dump_printf_loc (MSG_NOTE, vect_location,
3072 "reduction: multiple types: operation type: "
3073 "%T, operands types: %T,%T",
3074 type, TREE_TYPE (op1), TREE_TYPE (op2));
3075 if (op3)
3076 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3078 if (op4)
3079 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3080 dump_printf (MSG_NOTE, "\n");
3083 return NULL;
3086 /* Check whether it's ok to change the order of the computation.
3087 Generally, when vectorizing a reduction we change the order of the
3088 computation. This may change the behavior of the program in some
3089 cases, so we need to check that this is ok. One exception is when
3090 vectorizing an outer-loop: the inner-loop is executed sequentially,
3091 and therefore vectorizing reductions in the inner-loop during
3092 outer-loop vectorization is safe. */
3093 if (check_reduction
3094 && *v_reduc_type == TREE_CODE_REDUCTION
3095 && needs_fold_left_reduction_p (type, code,
3096 need_wrapping_integral_overflow))
3097 *v_reduc_type = FOLD_LEFT_REDUCTION;
3099 /* Reduction is safe. We're dealing with one of the following:
3100 1) integer arithmetic and no trapv
3101 2) floating point arithmetic, and special flags permit this optimization
3102 3) nested cycle (i.e., outer loop vectorization). */
3103 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3104 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3105 if (code != COND_EXPR && !def1_info && !def2_info)
3107 if (dump_enabled_p ())
3108 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3109 return NULL;
3112 /* Check that one def is the reduction def, defined by PHI,
3113 the other def is either defined in the loop ("vect_internal_def"),
3114 or it's an induction (defined by a loop-header phi-node). */
3116 if (def2_info
3117 && def2_info->stmt == phi
3118 && (code == COND_EXPR
3119 || !def1_info
3120 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3121 || vect_valid_reduction_input_p (def1_info)))
3123 if (dump_enabled_p ())
3124 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3125 return def_stmt_info;
3128 if (def1_info
3129 && def1_info->stmt == phi
3130 && (code == COND_EXPR
3131 || !def2_info
3132 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3133 || vect_valid_reduction_input_p (def2_info)))
3135 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3137 /* Check if we can swap operands (just for simplicity - so that
3138 the rest of the code can assume that the reduction variable
3139 is always the last (second) argument). */
3140 if (code == COND_EXPR)
3142 /* Swap cond_expr by inverting the condition. */
3143 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3144 enum tree_code invert_code = ERROR_MARK;
3145 enum tree_code cond_code = TREE_CODE (cond_expr);
3147 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3149 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3150 invert_code = invert_tree_comparison (cond_code, honor_nans);
3152 if (invert_code != ERROR_MARK)
3154 TREE_SET_CODE (cond_expr, invert_code);
3155 swap_ssa_operands (def_stmt,
3156 gimple_assign_rhs2_ptr (def_stmt),
3157 gimple_assign_rhs3_ptr (def_stmt));
3159 else
3161 if (dump_enabled_p ())
3162 report_vect_op (MSG_NOTE, def_stmt,
3163 "detected reduction: cannot swap operands "
3164 "for cond_expr");
3165 return NULL;
3168 else
3169 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3170 gimple_assign_rhs2_ptr (def_stmt));
3172 if (dump_enabled_p ())
3173 report_vect_op (MSG_NOTE, def_stmt,
3174 "detected reduction: need to swap operands: ");
3176 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3177 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3179 else
3181 if (dump_enabled_p ())
3182 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3185 return def_stmt_info;
3188 /* Try to find SLP reduction chain. */
3189 if (! nested_in_vect_loop
3190 && code != COND_EXPR
3191 && orig_code != MINUS_EXPR
3192 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3194 if (dump_enabled_p ())
3195 report_vect_op (MSG_NOTE, def_stmt,
3196 "reduction: detected reduction chain: ");
3198 return def_stmt_info;
3201 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3202 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3203 while (first)
3205 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3206 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3207 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3208 first = next;
3211 /* Look for the expression computing loop_arg from loop PHI result. */
3212 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3213 return def_stmt_info;
3215 if (dump_enabled_p ())
3217 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3218 "reduction: unknown pattern: ");
3221 return NULL;
3224 /* Wrapper around vect_is_simple_reduction, which will modify code
3225 in-place if it enables detection of more reductions. Arguments
3226 as there. */
3228 stmt_vec_info
3229 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3230 bool *double_reduc,
3231 bool need_wrapping_integral_overflow)
3233 enum vect_reduction_type v_reduc_type;
3234 stmt_vec_info def_info
3235 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3236 need_wrapping_integral_overflow,
3237 &v_reduc_type);
3238 if (def_info)
3240 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3241 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3242 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3243 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3245 return def_info;
3248 /* Calculate the cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3249 int
3250 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3251 int *peel_iters_epilogue,
3252 stmt_vector_for_cost *scalar_cost_vec,
3253 stmt_vector_for_cost *prologue_cost_vec,
3254 stmt_vector_for_cost *epilogue_cost_vec)
3256 int retval = 0;
3257 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3259 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3261 *peel_iters_epilogue = assumed_vf / 2;
3262 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_NOTE, vect_location,
3264 "cost model: epilogue peel iters set to vf/2 "
3265 "because loop iterations are unknown .\n");
3267 /* If peeled iterations are known but the number of scalar loop
3268 iterations is unknown, count a taken branch per peeled loop. */
3269 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3270 NULL, 0, vect_prologue);
3271 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3272 NULL, 0, vect_epilogue);
3274 else
3276 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3277 peel_iters_prologue = niters < peel_iters_prologue ?
3278 niters : peel_iters_prologue;
3279 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3280 /* If we need to peel for gaps but the epilogue would otherwise require
3281 no peeling, we have to peel VF iterations. */
3282 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3283 *peel_iters_epilogue = assumed_vf;
3286 stmt_info_for_cost *si;
3287 int j;
3288 if (peel_iters_prologue)
3289 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3290 retval += record_stmt_cost (prologue_cost_vec,
3291 si->count * peel_iters_prologue,
3292 si->kind, si->stmt_info, si->misalign,
3293 vect_prologue);
3294 if (*peel_iters_epilogue)
3295 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3296 retval += record_stmt_cost (epilogue_cost_vec,
3297 si->count * *peel_iters_epilogue,
3298 si->kind, si->stmt_info, si->misalign,
3299 vect_epilogue);
3301 return retval;
3304 /* Function vect_estimate_min_profitable_iters
3306 Return the number of iterations required for the vector version of the
3307 loop to be profitable relative to the cost of the scalar version of the
3308 loop.
3310 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3311 of iterations for vectorization. A value of -1 means loop vectorization
3312 is not profitable. This returned value may be used for dynamic
3313 profitability check.
3315 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3316 for static check against estimated number of iterations. */
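/* Illustrative note (added for exposition, not from the original sources):
   the runtime threshold ends up guarding the versioned/peeled loop roughly as

     if (niters >= th)
       run the vectorized loop;
     else
       run the scalar loop;

   whereas *RET_MIN_PROFITABLE_ESTIMATE is only compared against the
   statically estimated trip count during analysis.  */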
3318 static void
3319 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3320 int *ret_min_profitable_niters,
3321 int *ret_min_profitable_estimate)
3323 int min_profitable_iters;
3324 int min_profitable_estimate;
3325 int peel_iters_prologue;
3326 int peel_iters_epilogue;
3327 unsigned vec_inside_cost = 0;
3328 int vec_outside_cost = 0;
3329 unsigned vec_prologue_cost = 0;
3330 unsigned vec_epilogue_cost = 0;
3331 int scalar_single_iter_cost = 0;
3332 int scalar_outside_cost = 0;
3333 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3334 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3335 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3337 /* Cost model disabled. */
3338 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3340 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3341 *ret_min_profitable_niters = 0;
3342 *ret_min_profitable_estimate = 0;
3343 return;
3346 /* Requires loop versioning tests to handle misalignment. */
3347 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3349 /* FIXME: Make cost depend on complexity of individual check. */
3350 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3351 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3352 vect_prologue);
3353 dump_printf (MSG_NOTE,
3354 "cost model: Adding cost of checks for loop "
3355 "versioning to treat misalignment.\n");
3358 /* Requires loop versioning with alias checks. */
3359 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3361 /* FIXME: Make cost depend on complexity of individual check. */
3362 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3363 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3364 vect_prologue);
3365 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3366 if (len)
3367 /* Count LEN - 1 ANDs and LEN comparisons. */
3368 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3369 NULL, 0, vect_prologue);
3370 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3371 if (len)
3373 /* Count LEN - 1 ANDs and LEN comparisons. */
3374 unsigned int nstmts = len * 2 - 1;
3375 /* +1 for each bias that needs adding. */
3376 for (unsigned int i = 0; i < len; ++i)
3377 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3378 nstmts += 1;
3379 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3380 NULL, 0, vect_prologue);
3382 dump_printf (MSG_NOTE,
3383 "cost model: Adding cost of checks for loop "
3384 "versioning aliasing.\n");
3387 /* Requires loop versioning with niter checks. */
3388 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3390 /* FIXME: Make cost depend on complexity of individual check. */
3391 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3392 vect_prologue);
3393 dump_printf (MSG_NOTE,
3394 "cost model: Adding cost of checks for loop "
3395 "versioning niters.\n");
3398 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3399 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3400 vect_prologue);
3402 /* Count statements in scalar loop. Using this as scalar cost for a single
3403 iteration for now.
3405 TODO: Add outer loop support.
3407 TODO: Consider assigning different costs to different scalar
3408 statements. */
3410 scalar_single_iter_cost
3411 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3413 /* Add additional cost for the peeled instructions in prologue and epilogue
3414 loop. (For fully-masked loops there will be no peeling.)
3416 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3417 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3419 TODO: Build an expression that represents peel_iters for prologue and
3420 epilogue to be used in a run-time test. */
3422 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3424 peel_iters_prologue = 0;
3425 peel_iters_epilogue = 0;
3427 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3429 /* We need to peel exactly one iteration. */
3430 peel_iters_epilogue += 1;
3431 stmt_info_for_cost *si;
3432 int j;
3433 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3434 j, si)
3435 (void) add_stmt_cost (target_cost_data, si->count,
3436 si->kind, si->stmt_info, si->misalign,
3437 vect_epilogue);
3440 else if (npeel < 0)
3442 peel_iters_prologue = assumed_vf / 2;
3443 dump_printf (MSG_NOTE, "cost model: "
3444 "prologue peel iters set to vf/2.\n");
3446 /* If peeling for alignment is unknown, the loop bound of the main loop
3447 becomes unknown. */
3448 peel_iters_epilogue = assumed_vf / 2;
3449 dump_printf (MSG_NOTE, "cost model: "
3450 "epilogue peel iters set to vf/2 because "
3451 "peeling for alignment is unknown.\n");
3453 /* If peeled iterations are unknown, count a taken branch and a not taken
3454 branch per peeled loop. Even if scalar loop iterations are known,
3455 vector iterations are not known since peeled prologue iterations are
3456 not known. Hence guards remain the same. */
3457 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_prologue);
3459 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3460 NULL, 0, vect_prologue);
3461 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3462 NULL, 0, vect_epilogue);
3463 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3464 NULL, 0, vect_epilogue);
3465 stmt_info_for_cost *si;
3466 int j;
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3469 (void) add_stmt_cost (target_cost_data,
3470 si->count * peel_iters_prologue,
3471 si->kind, si->stmt_info, si->misalign,
3472 vect_prologue);
3473 (void) add_stmt_cost (target_cost_data,
3474 si->count * peel_iters_epilogue,
3475 si->kind, si->stmt_info, si->misalign,
3476 vect_epilogue);
3479 else
3481 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3482 stmt_info_for_cost *si;
3483 int j;
3484 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3486 prologue_cost_vec.create (2);
3487 epilogue_cost_vec.create (2);
3488 peel_iters_prologue = npeel;
3490 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3491 &peel_iters_epilogue,
3492 &LOOP_VINFO_SCALAR_ITERATION_COST
3493 (loop_vinfo),
3494 &prologue_cost_vec,
3495 &epilogue_cost_vec);
3497 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3498 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3499 si->misalign, vect_prologue);
3501 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3502 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3503 si->misalign, vect_epilogue);
3505 prologue_cost_vec.release ();
3506 epilogue_cost_vec.release ();
3509 /* FORNOW: The scalar outside cost is incremented in one of the
3510 following ways:
3512 1. The vectorizer checks for alignment and aliasing and generates
3513 a condition that allows dynamic vectorization. A cost model
3514 check is ANDED with the versioning condition. Hence scalar code
3515 path now has the added cost of the versioning check.
3517 if (cost > th & versioning_check)
3518 jmp to vector code
3520 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3522 2. The vectorizer then checks if a prologue is required. If the
3523 cost model check was not done before during versioning, it has to
3524 be done before the prologue check.
3526 if (cost <= th)
3527 prologue = scalar_iters
3528 if (prologue == 0)
3529 jmp to vector code
3530 else
3531 execute prologue
3532 if (prologue == num_iters)
3533 go to exit
3535 Hence the run-time scalar cost is incremented by a taken branch,
3536 plus a not-taken branch, plus a taken branch cost.
3538 3. The vectorizer then checks if an epilogue is required. If the
3539 cost model check was not done before during prologue check, it
3540 has to be done with the epilogue check.
3542 if (prologue == 0)
3543 jmp to vector code
3544 else
3545 execute prologue
3546 if (prologue == num_iters)
3547 go to exit
3548 vector code:
3549 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3550 jmp to epilogue
3552 Hence the run-time scalar cost should be incremented by 2 taken
3553 branches.
3555 TODO: The back end may reorder the BBs differently and reverse
3556 conditions/branch directions. Change the estimates below to
3557 something more reasonable. */
3559 /* If the number of iterations is known and we do not do versioning, we can
3560 decide whether to vectorize at compile time. Hence the scalar version
3561 does not carry cost model guard costs. */
3562 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3563 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3565 /* Cost model check occurs at versioning. */
3566 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3567 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3568 else
3570 /* Cost model check occurs at prologue generation. */
3571 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3572 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3573 + vect_get_stmt_cost (cond_branch_not_taken);
3574 /* Cost model check occurs at epilogue generation. */
3575 else
3576 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3580 /* Complete the target-specific cost calculations. */
3581 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3582 &vec_inside_cost, &vec_epilogue_cost);
3584 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3586 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3589 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3590 vec_inside_cost);
3591 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3592 vec_prologue_cost);
3593 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3594 vec_epilogue_cost);
3595 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3596 scalar_single_iter_cost);
3597 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3598 scalar_outside_cost);
3599 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3600 vec_outside_cost);
3601 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3602 peel_iters_prologue);
3603 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3604 peel_iters_epilogue);
3607 /* Calculate number of iterations required to make the vector version
3608 profitable, relative to the loop bodies only. The following condition
3609 must hold true:
3610 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3611 where
3612 SIC = scalar iteration cost, VIC = vector iteration cost,
3613 VOC = vector outside cost, VF = vectorization factor,
3614 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3615 SOC = scalar outside cost for run time cost model check. */
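/* For example, assuming SIC = 4, VIC = 8, VOC = 20, SOC = 0, VF = 4 and
   no peeling, the condition above becomes 16 * niters > 8 * niters + 80,
   i.e. niters > 10, so the computation below arrives at a runtime
   threshold of 11 iterations.  */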
3617 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3619 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3620 * assumed_vf
3621 - vec_inside_cost * peel_iters_prologue
3622 - vec_inside_cost * peel_iters_epilogue);
3623 if (min_profitable_iters <= 0)
3624 min_profitable_iters = 0;
3625 else
3627 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3628 - vec_inside_cost);
3630 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3631 <= (((int) vec_inside_cost * min_profitable_iters)
3632 + (((int) vec_outside_cost - scalar_outside_cost)
3633 * assumed_vf)))
3634 min_profitable_iters++;
3637 /* vector version will never be profitable. */
3638 else
3640 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3641 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3642 "vectorization did not happen for a simd loop");
3644 if (dump_enabled_p ())
3645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3646 "cost model: the vector iteration cost = %d "
3647 "divided by the scalar iteration cost = %d "
3648 "is greater or equal to the vectorization factor = %d"
3649 ".\n",
3650 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3651 *ret_min_profitable_niters = -1;
3652 *ret_min_profitable_estimate = -1;
3653 return;
3656 dump_printf (MSG_NOTE,
3657 " Calculated minimum iters for profitability: %d\n",
3658 min_profitable_iters);
3660 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3661 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3662 /* We want the vectorized loop to execute at least once. */
3663 min_profitable_iters = assumed_vf + peel_iters_prologue;
3665 if (dump_enabled_p ())
3666 dump_printf_loc (MSG_NOTE, vect_location,
3667 " Runtime profitability threshold = %d\n",
3668 min_profitable_iters);
3670 *ret_min_profitable_niters = min_profitable_iters;
3672 /* Calculate number of iterations required to make the vector version
3673 profitable, relative to the loop bodies only.
3675 The non-vectorized variant costs SIC * niters and it must win over the
3676 vector variant on the expected loop trip count. The following condition must hold true:
3677 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
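/* Continuing the example above (SIC = 4, VIC = 8, VOC = 20, VF = 4, no
   peeling), but now with SOC = 8 charged to the vector side, the estimate
   below is (20 + 8) * 4 / (4 * 4 - 8) = 14 iterations.  */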
3679 if (vec_outside_cost <= 0)
3680 min_profitable_estimate = 0;
3681 else
3683 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3684 * assumed_vf
3685 - vec_inside_cost * peel_iters_prologue
3686 - vec_inside_cost * peel_iters_epilogue)
3687 / ((scalar_single_iter_cost * assumed_vf)
3688 - vec_inside_cost);
3690 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_NOTE, vect_location,
3693 " Static estimate profitability threshold = %d\n",
3694 min_profitable_estimate);
3696 *ret_min_profitable_estimate = min_profitable_estimate;
3699 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3700 vector elements (not bits) for a vector with NELT elements. */
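/* For example, OFFSET = 2 and NELT = 8 encodes the series {2, 3, 4, ...},
   which vec_perm_indices extends to the selector {2, 3, 4, 5, 6, 7, 8, 9};
   applied to {v0, ..., v7} with a zero second operand this yields
   {v2, ..., v7, 0, 0}, i.e. a shift down by two elements.  */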
3701 static void
3702 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3703 vec_perm_builder *sel)
3705 /* The encoding is a single stepped pattern. Any wrap-around is handled
3706 by vec_perm_indices. */
3707 sel->new_vector (nelt, 1, 3);
3708 for (unsigned int i = 0; i < 3; i++)
3709 sel->quick_push (i + offset);
3712 /* Checks whether the target supports whole-vector shifts for vectors of mode
3713 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3714 it supports vec_perm_const with masks for all necessary shift amounts. */
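/* For example, for V8HI (NELT = 8) the loop below checks permute masks for
   the shift amounts 4, 2 and 1, which are exactly the amounts used when
   reducing with whole-vector shifts in the reduction epilogue.  */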
3715 static bool
3716 have_whole_vector_shift (machine_mode mode)
3718 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3719 return true;
3721 /* Variable-length vectors should be handled via the optab. */
3722 unsigned int nelt;
3723 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3724 return false;
3726 vec_perm_builder sel;
3727 vec_perm_indices indices;
3728 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3730 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3731 indices.new_vector (sel, 2, nelt);
3732 if (!can_vec_perm_const_p (mode, indices, false))
3733 return false;
3735 return true;
3738 /* TODO: There is a close dependency between the vect_model_*_cost and
3739 vectorizable_* functions. A better design would avoid such maintenance issues. */
3741 /* Function vect_model_reduction_cost.
3743 Models cost for a reduction operation, including the vector ops
3744 generated within the strip-mine loop, the initial definition before
3745 the loop, and the epilogue code that must be generated. */
3747 static void
3748 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3749 int ncopies, stmt_vector_for_cost *cost_vec)
3751 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3752 enum tree_code code;
3753 optab optab;
3754 tree vectype;
3755 machine_mode mode;
3756 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3757 struct loop *loop = NULL;
3759 if (loop_vinfo)
3760 loop = LOOP_VINFO_LOOP (loop_vinfo);
3762 /* Condition reductions generate two reductions in the loop. */
3763 vect_reduction_type reduction_type
3764 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3765 if (reduction_type == COND_REDUCTION)
3766 ncopies *= 2;
3768 vectype = STMT_VINFO_VECTYPE (stmt_info);
3769 mode = TYPE_MODE (vectype);
3770 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3772 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3774 if (reduction_type == EXTRACT_LAST_REDUCTION
3775 || reduction_type == FOLD_LEFT_REDUCTION)
3777 /* No extra instructions needed in the prologue. */
3778 prologue_cost = 0;
3780 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3781 /* Count one reduction-like operation per vector. */
3782 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3783 stmt_info, 0, vect_body);
3784 else
3786 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3787 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3788 inside_cost = record_stmt_cost (cost_vec, nelements,
3789 vec_to_scalar, stmt_info, 0,
3790 vect_body);
3791 inside_cost += record_stmt_cost (cost_vec, nelements,
3792 scalar_stmt, stmt_info, 0,
3793 vect_body);
3796 else
3798 /* Add in cost for initial definition.
3799 For cond reduction we have four vectors: initial index, step,
3800 initial result of the data reduction, initial value of the index
3801 reduction. */
3802 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3803 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3804 scalar_to_vec, stmt_info, 0,
3805 vect_prologue);
3807 /* Cost of reduction op inside loop. */
3808 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3809 stmt_info, 0, vect_body);
3812 /* Determine cost of epilogue code.
3814 We have a reduction operator that will reduce the vector in one statement.
3815 Also requires scalar extract. */
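/* For example, for a plain V8HI reduction with reduc_fn == IFN_LAST, the
   whole-vector shift scheme below costs exact_log2 (8) * 2 = 6 vector
   statements plus one extract, whereas the extract-based fallback costs
   8 + 7 = 15 vector statements.  */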
3817 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3819 if (reduc_fn != IFN_LAST)
3821 if (reduction_type == COND_REDUCTION)
3823 /* An EQ stmt and a COND_EXPR stmt. */
3824 epilogue_cost += record_stmt_cost (cost_vec, 2,
3825 vector_stmt, stmt_info, 0,
3826 vect_epilogue);
3827 /* Reduction of the max index and a reduction of the found
3828 values. */
3829 epilogue_cost += record_stmt_cost (cost_vec, 2,
3830 vec_to_scalar, stmt_info, 0,
3831 vect_epilogue);
3832 /* A broadcast of the max value. */
3833 epilogue_cost += record_stmt_cost (cost_vec, 1,
3834 scalar_to_vec, stmt_info, 0,
3835 vect_epilogue);
3837 else
3839 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3840 stmt_info, 0, vect_epilogue);
3841 epilogue_cost += record_stmt_cost (cost_vec, 1,
3842 vec_to_scalar, stmt_info, 0,
3843 vect_epilogue);
3846 else if (reduction_type == COND_REDUCTION)
3848 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3849 /* Extraction of scalar elements. */
3850 epilogue_cost += record_stmt_cost (cost_vec,
3851 2 * estimated_nunits,
3852 vec_to_scalar, stmt_info, 0,
3853 vect_epilogue);
3854 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3855 epilogue_cost += record_stmt_cost (cost_vec,
3856 2 * estimated_nunits - 3,
3857 scalar_stmt, stmt_info, 0,
3858 vect_epilogue);
3860 else if (reduction_type == EXTRACT_LAST_REDUCTION
3861 || reduction_type == FOLD_LEFT_REDUCTION)
3862 /* No extra instructions needed in the epilogue. */
3864 else
3866 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3867 tree bitsize =
3868 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3869 int element_bitsize = tree_to_uhwi (bitsize);
3870 int nelements = vec_size_in_bits / element_bitsize;
3872 if (code == COND_EXPR)
3873 code = MAX_EXPR;
3875 optab = optab_for_tree_code (code, vectype, optab_default);
3877 /* We have a whole vector shift available. */
3878 if (optab != unknown_optab
3879 && VECTOR_MODE_P (mode)
3880 && optab_handler (optab, mode) != CODE_FOR_nothing
3881 && have_whole_vector_shift (mode))
3883 /* Final reduction via vector shifts and the reduction operator.
3884 Also requires scalar extract. */
3885 epilogue_cost += record_stmt_cost (cost_vec,
3886 exact_log2 (nelements) * 2,
3887 vector_stmt, stmt_info, 0,
3888 vect_epilogue);
3889 epilogue_cost += record_stmt_cost (cost_vec, 1,
3890 vec_to_scalar, stmt_info, 0,
3891 vect_epilogue);
3893 else
3894 /* Use extracts and reduction op for final reduction. For N
3895 elements, we have N extracts and N-1 reduction ops. */
3896 epilogue_cost += record_stmt_cost (cost_vec,
3897 nelements + nelements - 1,
3898 vector_stmt, stmt_info, 0,
3899 vect_epilogue);
3903 if (dump_enabled_p ())
3904 dump_printf (MSG_NOTE,
3905 "vect_model_reduction_cost: inside_cost = %d, "
3906 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3907 prologue_cost, epilogue_cost);
3911 /* Function vect_model_induction_cost.
3913 Models cost for induction operations. */
3915 static void
3916 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3917 stmt_vector_for_cost *cost_vec)
3919 unsigned inside_cost, prologue_cost;
3921 if (PURE_SLP_STMT (stmt_info))
3922 return;
3924 /* loop cost for vec_loop. */
3925 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3926 stmt_info, 0, vect_body);
3928 /* prologue cost for vec_init and vec_step. */
3929 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3930 stmt_info, 0, vect_prologue);
3932 if (dump_enabled_p ())
3933 dump_printf_loc (MSG_NOTE, vect_location,
3934 "vect_model_induction_cost: inside_cost = %d, "
3935 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3940 /* Function get_initial_def_for_reduction
3942 Input:
3943 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3944 INIT_VAL - the initial value of the reduction variable
3946 Output:
3947 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3948 of the reduction (used for adjusting the epilog - see below).
3949 Return a vector variable, initialized according to the operation that
3950 STMT_VINFO performs. This vector will be used as the initial value
3951 of the vector of partial results.
3953 Option1 (adjust in epilog): Initialize the vector as follows:
3954 add/bit or/xor: [0,0,...,0,0]
3955 mult/bit and: [1,1,...,1,1]
3956 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3957 and when necessary (e.g. add/mult case) let the caller know
3958 that it needs to adjust the result by init_val.
3960 Option2: Initialize the vector as follows:
3961 add/bit or/xor: [init_val,0,0,...,0]
3962 mult/bit and: [init_val,1,1,...,1]
3963 min/max/cond_expr: [init_val,init_val,...,init_val]
3964 and no adjustments are needed.
3966 For example, for the following code:
3968 s = init_val;
3969 for (i=0;i<n;i++)
3970 s = s + a[i];
3972 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3973 For a vector of 4 units, we want to return either [0,0,0,init_val],
3974 or [0,0,0,0] and let the caller know that it needs to adjust
3975 the result at the end by 'init_val'.
3977 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3978 is not NULL, because then the initialization vector is simpler (the same
3979 element in all entries), and Option2 otherwise.
3981 A cost model should help decide between these two schemes. */
3983 tree
3984 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3985 tree *adjustment_def)
3987 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3988 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3989 tree scalar_type = TREE_TYPE (init_val);
3990 tree vectype = get_vectype_for_scalar_type (scalar_type);
3991 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3992 tree def_for_init;
3993 tree init_def;
3994 REAL_VALUE_TYPE real_init_val = dconst0;
3995 int int_init_val = 0;
3996 gimple_seq stmts = NULL;
3998 gcc_assert (vectype);
4000 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4001 || SCALAR_FLOAT_TYPE_P (scalar_type));
4003 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4004 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4006 vect_reduction_type reduction_type
4007 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4009 switch (code)
4011 case WIDEN_SUM_EXPR:
4012 case DOT_PROD_EXPR:
4013 case SAD_EXPR:
4014 case PLUS_EXPR:
4015 case MINUS_EXPR:
4016 case BIT_IOR_EXPR:
4017 case BIT_XOR_EXPR:
4018 case MULT_EXPR:
4019 case BIT_AND_EXPR:
4021 /* ADJUSTMENT_DEF is NULL when called from
4022 vect_create_epilog_for_reduction to vectorize double reduction. */
4023 if (adjustment_def)
4024 *adjustment_def = init_val;
4026 if (code == MULT_EXPR)
4028 real_init_val = dconst1;
4029 int_init_val = 1;
4032 if (code == BIT_AND_EXPR)
4033 int_init_val = -1;
4035 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4036 def_for_init = build_real (scalar_type, real_init_val);
4037 else
4038 def_for_init = build_int_cst (scalar_type, int_init_val);
4040 if (adjustment_def)
4041 /* Option1: the first element is '0' or '1' as well. */
4042 init_def = gimple_build_vector_from_val (&stmts, vectype,
4043 def_for_init);
4044 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4046 /* Option2 (variable length): the first element is INIT_VAL. */
4047 init_def = gimple_build_vector_from_val (&stmts, vectype,
4048 def_for_init);
4049 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4050 vectype, init_def, init_val);
4052 else
4054 /* Option2: the first element is INIT_VAL. */
4055 tree_vector_builder elts (vectype, 1, 2);
4056 elts.quick_push (init_val);
4057 elts.quick_push (def_for_init);
4058 init_def = gimple_build_vector (&stmts, &elts);
4061 break;
4063 case MIN_EXPR:
4064 case MAX_EXPR:
4065 case COND_EXPR:
4067 if (adjustment_def)
4069 *adjustment_def = NULL_TREE;
4070 if (reduction_type != COND_REDUCTION
4071 && reduction_type != EXTRACT_LAST_REDUCTION)
4073 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4074 break;
4077 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4078 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080 break;
4082 default:
4083 gcc_unreachable ();
4086 if (stmts)
4087 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4088 return init_def;
4091 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4092 NUMBER_OF_VECTORS is the number of vector defs to create.
4093 If NEUTRAL_OP is nonnull, introducing extra elements of that
4094 value will not change the result. */
4096 static void
4097 get_initial_defs_for_reduction (slp_tree slp_node,
4098 vec<tree> *vec_oprnds,
4099 unsigned int number_of_vectors,
4100 bool reduc_chain, tree neutral_op)
4102 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4103 stmt_vec_info stmt_vinfo = stmts[0];
4104 unsigned HOST_WIDE_INT nunits;
4105 unsigned j, number_of_places_left_in_vector;
4106 tree vector_type;
4107 tree vop;
4108 int group_size = stmts.length ();
4109 unsigned int vec_num, i;
4110 unsigned number_of_copies = 1;
4111 vec<tree> voprnds;
4112 voprnds.create (number_of_vectors);
4113 struct loop *loop;
4114 auto_vec<tree, 16> permute_results;
4116 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4118 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4120 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4121 gcc_assert (loop);
4122 edge pe = loop_preheader_edge (loop);
4124 gcc_assert (!reduc_chain || neutral_op);
4126 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4127 created vectors. It is greater than 1 if unrolling is performed.
4129 For example, we have two scalar operands, s1 and s2 (e.g., group of
4130 strided accesses of size two), while NUNITS is four (i.e., four scalars
4131 of this type can be packed in a vector). The output vector will contain
4132 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4133 will be 2).
4135 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4136 vectors containing the operands.
4138 For example, NUNITS is four as before, and the group size is 8
4139 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4140 {s5, s6, s7, s8}. */
4142 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4143 nunits = group_size;
4145 number_of_copies = nunits * number_of_vectors / group_size;
4147 number_of_places_left_in_vector = nunits;
4148 bool constant_p = true;
4149 tree_vector_builder elts (vector_type, nunits, 1);
4150 elts.quick_grow (nunits);
4151 for (j = 0; j < number_of_copies; j++)
4153 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4155 tree op;
4156 /* Get the def before the loop. In a reduction chain we have only
4157 one initial value. */
4158 if ((j != (number_of_copies - 1)
4159 || (reduc_chain && i != 0))
4160 && neutral_op)
4161 op = neutral_op;
4162 else
4163 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4165 /* Create 'vect_ = {op0,op1,...,opn}'. */
4166 number_of_places_left_in_vector--;
4167 elts[number_of_places_left_in_vector] = op;
4168 if (!CONSTANT_CLASS_P (op))
4169 constant_p = false;
4171 if (number_of_places_left_in_vector == 0)
4173 gimple_seq ctor_seq = NULL;
4174 tree init;
4175 if (constant_p && !neutral_op
4176 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4177 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4178 /* Build the vector directly from ELTS. */
4179 init = gimple_build_vector (&ctor_seq, &elts);
4180 else if (neutral_op)
4182 /* Build a vector of the neutral value and shift the
4183 other elements into place. */
4184 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4185 neutral_op);
4186 int k = nunits;
4187 while (k > 0 && elts[k - 1] == neutral_op)
4188 k -= 1;
4189 while (k > 0)
4191 k -= 1;
4192 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4193 vector_type, init, elts[k]);
4196 else
4198 /* First time round, duplicate ELTS to fill the
4199 required number of vectors, then cherry pick the
4200 appropriate result for each iteration. */
4201 if (vec_oprnds->is_empty ())
4202 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4203 number_of_vectors,
4204 permute_results);
4205 init = permute_results[number_of_vectors - j - 1];
4207 if (ctor_seq != NULL)
4208 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4209 voprnds.quick_push (init);
4211 number_of_places_left_in_vector = nunits;
4212 elts.new_vector (vector_type, nunits, 1);
4213 elts.quick_grow (nunits);
4214 constant_p = true;
4219 /* Since the vectors were created in reverse order, we now need to reverse
4220 them. */
4221 vec_num = voprnds.length ();
4222 for (j = vec_num; j != 0; j--)
4224 vop = voprnds[j - 1];
4225 vec_oprnds->quick_push (vop);
4228 voprnds.release ();
4230 /* In case that VF is greater than the unrolling factor needed for the SLP
4231 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4232 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4233 to replicate the vectors. */
4234 tree neutral_vec = NULL;
4235 while (number_of_vectors > vec_oprnds->length ())
4237 if (neutral_op)
4239 if (!neutral_vec)
4241 gimple_seq ctor_seq = NULL;
4242 neutral_vec = gimple_build_vector_from_val
4243 (&ctor_seq, vector_type, neutral_op);
4244 if (ctor_seq != NULL)
4245 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4247 vec_oprnds->quick_push (neutral_vec);
4249 else
4251 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4252 vec_oprnds->quick_push (vop);
4258 /* Function vect_create_epilog_for_reduction
4260 Create code at the loop-epilog to finalize the result of a reduction
4261 computation.
4263 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4264 reduction statements.
4265 STMT_INFO is the scalar reduction stmt that is being vectorized.
4266 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4267 number of elements that we can fit in a vectype (nunits). In this case
4268 we have to generate more than one vector stmt - i.e - we need to "unroll"
4269 the vector stmt by a factor VF/nunits. For more details see documentation
4270 in vectorizable_operation.
4271 REDUC_FN is the internal function for the epilog reduction.
4272 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4273 computation.
4274 REDUC_INDEX is the index of the operand in the right hand side of the
4275 statement that is defined by REDUCTION_PHI.
4276 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4277 SLP_NODE is an SLP node containing a group of reduction statements. The
4278 first one in this group is STMT_INFO.
4279 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4280 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4281 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4282 any value of the IV in the loop.
4283 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4284 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4285 null if this is not an SLP reduction
4287 This function:
4288 1. Creates the reduction def-use cycles: sets the arguments for
4289 REDUCTION_PHIS:
4290 The loop-entry argument is the vectorized initial-value of the reduction.
4291 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4292 sums.
4293 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4294 by calling the function specified by REDUC_FN if available, or by
4295 other means (whole-vector shifts or a scalar loop).
4296 The function also creates a new phi node at the loop exit to preserve
4297 loop-closed form, as illustrated below.
4299 The flow at the entry to this function:
4301 loop:
4302 vec_def = phi <null, null> # REDUCTION_PHI
4303 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4304 s_loop = scalar_stmt # (scalar) STMT_INFO
4305 loop_exit:
4306 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4307 use <s_out0>
4308 use <s_out0>
4310 The above is transformed by this function into:
4312 loop:
4313 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4314 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4315 s_loop = scalar_stmt # (scalar) STMT_INFO
4316 loop_exit:
4317 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4318 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4319 v_out2 = reduce <v_out1>
4320 s_out3 = extract_field <v_out2, 0>
4321 s_out4 = adjust_result <s_out3>
4322 use <s_out4>
4323 use <s_out4>
4326 static void
4327 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4328 stmt_vec_info stmt_info,
4329 gimple *reduc_def_stmt,
4330 int ncopies, internal_fn reduc_fn,
4331 vec<stmt_vec_info> reduction_phis,
4332 bool double_reduc,
4333 slp_tree slp_node,
4334 slp_instance slp_node_instance,
4335 tree induc_val, enum tree_code induc_code,
4336 tree neutral_op)
4338 stmt_vec_info prev_phi_info;
4339 tree vectype;
4340 machine_mode mode;
4341 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4343 basic_block exit_bb;
4344 tree scalar_dest;
4345 tree scalar_type;
4346 gimple *new_phi = NULL, *phi;
4347 stmt_vec_info phi_info;
4348 gimple_stmt_iterator exit_gsi;
4349 tree vec_dest;
4350 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4351 gimple *epilog_stmt = NULL;
4352 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4353 gimple *exit_phi;
4354 tree bitsize;
4355 tree adjustment_def = NULL;
4356 tree vec_initial_def = NULL;
4357 tree expr, def, initial_def = NULL;
4358 tree orig_name, scalar_result;
4359 imm_use_iterator imm_iter, phi_imm_iter;
4360 use_operand_p use_p, phi_use_p;
4361 gimple *use_stmt;
4362 stmt_vec_info reduction_phi_info = NULL;
4363 bool nested_in_vect_loop = false;
4364 auto_vec<gimple *> new_phis;
4365 auto_vec<stmt_vec_info> inner_phis;
4366 int j, i;
4367 auto_vec<tree> scalar_results;
4368 unsigned int group_size = 1, k, ratio;
4369 auto_vec<tree> vec_initial_defs;
4370 auto_vec<gimple *> phis;
4371 bool slp_reduc = false;
4372 bool direct_slp_reduc;
4373 tree new_phi_result;
4374 stmt_vec_info inner_phi = NULL;
4375 tree induction_index = NULL_TREE;
4377 if (slp_node)
4378 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4380 if (nested_in_vect_loop_p (loop, stmt_info))
4382 outer_loop = loop;
4383 loop = loop->inner;
4384 nested_in_vect_loop = true;
4385 gcc_assert (!slp_node);
4388 vectype = STMT_VINFO_VECTYPE (stmt_info);
4389 gcc_assert (vectype);
4390 mode = TYPE_MODE (vectype);
4392 /* 1. Create the reduction def-use cycle:
4393 Set the arguments of REDUCTION_PHIS, i.e., transform
4395 loop:
4396 vec_def = phi <null, null> # REDUCTION_PHI
4397 VECT_DEF = vector_stmt # vectorized form of STMT
4400 into:
4402 loop:
4403 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4404 VECT_DEF = vector_stmt # vectorized form of STMT
4407 (in case of SLP, do it for all the phis). */
4409 /* Get the loop-entry arguments. */
4410 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4411 if (slp_node)
4413 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4414 vec_initial_defs.reserve (vec_num);
4415 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4416 &vec_initial_defs, vec_num,
4417 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4418 neutral_op);
4420 else
4422 /* Get at the scalar def before the loop, that defines the initial value
4423 of the reduction variable. */
4424 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4425 loop_preheader_edge (loop));
4426 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4427 and we can't use zero for induc_val, use initial_def. Similarly
4428 for REDUC_MIN and initial_def larger than the base. */
4429 if (TREE_CODE (initial_def) == INTEGER_CST
4430 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4431 == INTEGER_INDUC_COND_REDUCTION)
4432 && !integer_zerop (induc_val)
4433 && ((induc_code == MAX_EXPR
4434 && tree_int_cst_lt (initial_def, induc_val))
4435 || (induc_code == MIN_EXPR
4436 && tree_int_cst_lt (induc_val, initial_def))))
4437 induc_val = initial_def;
4439 if (double_reduc)
4440 /* In case of double reduction we only create a vector variable
4441 to be put in the reduction phi node. The actual statement
4442 creation is done later in this function. */
4443 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4444 else if (nested_in_vect_loop)
4446 /* Do not use an adjustment def as that case is not supported
4447 correctly if ncopies is not one. */
4448 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4449 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4450 stmt_info);
4452 else
4453 vec_initial_def
4454 = get_initial_def_for_reduction (stmt_info, initial_def,
4455 &adjustment_def);
4456 vec_initial_defs.create (1);
4457 vec_initial_defs.quick_push (vec_initial_def);
4460 /* Set phi nodes arguments. */
4461 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4463 tree vec_init_def = vec_initial_defs[i];
4464 tree def = vect_defs[i];
4465 for (j = 0; j < ncopies; j++)
4467 if (j != 0)
4469 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4470 if (nested_in_vect_loop)
4471 vec_init_def
4472 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4475 /* Set the loop-entry arg of the reduction-phi. */
4477 gphi *phi = as_a <gphi *> (phi_info->stmt);
4478 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4479 == INTEGER_INDUC_COND_REDUCTION)
4481 /* Initialise the reduction phi to zero. This prevents non-zero
4482 initial values from interfering with the reduction op. */
4483 gcc_assert (ncopies == 1);
4484 gcc_assert (i == 0);
4486 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4487 tree induc_val_vec
4488 = build_vector_from_val (vec_init_def_type, induc_val);
4490 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4491 UNKNOWN_LOCATION);
4493 else
4494 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4495 UNKNOWN_LOCATION);
4497 /* Set the loop-latch arg for the reduction-phi. */
4498 if (j > 0)
4499 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4501 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4503 if (dump_enabled_p ())
4504 dump_printf_loc (MSG_NOTE, vect_location,
4505 "transform reduction: created def-use cycle: %G%G",
4506 phi, SSA_NAME_DEF_STMT (def));
4510 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4511 which is updated with the current index of the loop for every match of
4512 the original loop's cond_expr (VEC_STMT). This results in a vector
4513 containing the last time the condition passed for that vector lane.
4514 The first match will be a 1 to allow 0 to be used for non-matching
4515 indexes. If there are no matches at all then the vector will be all
4516 zeroes. */
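/* For example, for V4SI the phi starts as {0, 0, 0, 0}; if lanes 1 and 3
   match in the first vector iteration (indexes {1, 2, 3, 4}) it becomes
   {0, 2, 0, 4}, and if only lane 2 matches in the second iteration
   (indexes {5, 6, 7, 8}) it becomes {0, 2, 7, 4}, i.e. each lane records
   the last index at which its condition held, with 0 meaning never.  */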
4517 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4519 tree indx_before_incr, indx_after_incr;
4520 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4522 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4523 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4525 int scalar_precision
4526 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4527 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4528 tree cr_index_vector_type = build_vector_type
4529 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4531 /* First we create a simple vector induction variable which starts
4532 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4533 vector size (STEP). */
4535 /* Create a {1,2,3,...} vector. */
4536 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4538 /* Create a vector of the step value. */
4539 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4540 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4542 /* Create an induction variable. */
4543 gimple_stmt_iterator incr_gsi;
4544 bool insert_after;
4545 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4546 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4547 insert_after, &indx_before_incr, &indx_after_incr);
4549 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4550 filled with zeros (VEC_ZERO). */
4552 /* Create a vector of 0s. */
4553 tree zero = build_zero_cst (cr_index_scalar_type);
4554 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4556 /* Create a vector phi node. */
4557 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4558 new_phi = create_phi_node (new_phi_tree, loop->header);
4559 loop_vinfo->add_stmt (new_phi);
4560 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4561 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4563 /* Now take the condition from the loop's original cond_expr
4564 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4565 every match uses values from the induction variable
4566 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4567 (NEW_PHI_TREE).
4568 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4569 the new cond_expr (INDEX_COND_EXPR). */
4571 /* Duplicate the condition from vec_stmt. */
4572 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4574 /* Create a conditional, where the condition is taken from vec_stmt
4575 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4576 else is the phi (NEW_PHI_TREE). */
4577 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4578 ccompare, indx_before_incr,
4579 new_phi_tree);
4580 induction_index = make_ssa_name (cr_index_vector_type);
4581 gimple *index_condition = gimple_build_assign (induction_index,
4582 index_cond_expr);
4583 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4584 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4585 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4587 /* Update the phi with the vec cond. */
4588 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4589 loop_latch_edge (loop), UNKNOWN_LOCATION);
4592 /* 2. Create epilog code.
4593 The reduction epilog code operates across the elements of the vector
4594 of partial results computed by the vectorized loop.
4595 The reduction epilog code consists of:
4597 step 1: compute the scalar result in a vector (v_out2)
4598 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4599 step 3: adjust the scalar result (s_out3) if needed.
4601 Step 1 can be accomplished using one of the following three schemes:
4602 (scheme 1) using reduc_fn, if available.
4603 (scheme 2) using whole-vector shifts, if available.
4604 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4605 combined.
4607 The overall epilog code looks like this:
4609 s_out0 = phi <s_loop> # original EXIT_PHI
4610 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4611 v_out2 = reduce <v_out1> # step 1
4612 s_out3 = extract_field <v_out2, 0> # step 2
4613 s_out4 = adjust_result <s_out3> # step 3
4615 (step 3 is optional, and steps 1 and 2 may be combined).
4616 Lastly, the uses of s_out0 are replaced by s_out4. */
4619 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4620 v_out1 = phi <VECT_DEF>
4621 Store them in NEW_PHIS. */
4623 exit_bb = single_exit (loop)->dest;
4624 prev_phi_info = NULL;
4625 new_phis.create (vect_defs.length ());
4626 FOR_EACH_VEC_ELT (vect_defs, i, def)
4628 for (j = 0; j < ncopies; j++)
4630 tree new_def = copy_ssa_name (def);
4631 phi = create_phi_node (new_def, exit_bb);
4632 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4633 if (j == 0)
4634 new_phis.quick_push (phi);
4635 else
4637 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4638 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4641 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4642 prev_phi_info = phi_info;
4646 /* The epilogue is created for the outer-loop, i.e., for the loop being
4647 vectorized. Create exit phis for the outer loop. */
4648 if (double_reduc)
4650 loop = outer_loop;
4651 exit_bb = single_exit (loop)->dest;
4652 inner_phis.create (vect_defs.length ());
4653 FOR_EACH_VEC_ELT (new_phis, i, phi)
4655 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4656 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4657 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4658 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4659 PHI_RESULT (phi));
4660 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4661 inner_phis.quick_push (phi_info);
4662 new_phis[i] = outer_phi;
4663 while (STMT_VINFO_RELATED_STMT (phi_info))
4665 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4666 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4667 outer_phi = create_phi_node (new_result, exit_bb);
4668 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4669 PHI_RESULT (phi_info->stmt));
4670 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4671 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4672 prev_phi_info = outer_phi_info;
4677 exit_gsi = gsi_after_labels (exit_bb);
4679 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4680 (i.e. when reduc_fn is not available) and in the final adjustment
4681 code (if needed). Also get the original scalar reduction variable as
4682 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4683 represents a reduction pattern), the tree-code and scalar-def are
4684 taken from the original stmt that the pattern-stmt (STMT) replaces.
4685 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4686 are taken from STMT. */
4688 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4689 if (orig_stmt_info != stmt_info)
4691 /* Reduction pattern */
4692 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4693 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4696 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4697 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4698 partial results are added and not subtracted. */
4699 if (code == MINUS_EXPR)
4700 code = PLUS_EXPR;
4702 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4703 scalar_type = TREE_TYPE (scalar_dest);
4704 scalar_results.create (group_size);
4705 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4706 bitsize = TYPE_SIZE (scalar_type);
4708 /* In case this is a reduction in an inner-loop while vectorizing an outer
4709 loop - we don't need to extract a single scalar result at the end of the
4710 inner-loop (unless it is double reduction, i.e., the use of reduction is
4711 outside the outer-loop). The final vector of partial results will be used
4712 in the vectorized outer-loop, or reduced to a scalar result at the end of
4713 the outer-loop. */
4714 if (nested_in_vect_loop && !double_reduc)
4715 goto vect_finalize_reduction;
4717 /* SLP reduction without reduction chain, e.g.,
4718 # a1 = phi <a2, a0>
4719 # b1 = phi <b2, b0>
4720 a2 = operation (a1)
4721 b2 = operation (b1) */
4722 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4724 /* True if we should implement SLP_REDUC using native reduction operations
4725 instead of scalar operations. */
4726 direct_slp_reduc = (reduc_fn != IFN_LAST
4727 && slp_reduc
4728 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4730 /* In case of reduction chain, e.g.,
4731 # a1 = phi <a3, a0>
4732 a2 = operation (a1)
4733 a3 = operation (a2),
4735 we may end up with more than one vector result. Here we reduce them to
4736 one vector. */
4737 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4739 tree first_vect = PHI_RESULT (new_phis[0]);
4740 gassign *new_vec_stmt = NULL;
4741 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4742 for (k = 1; k < new_phis.length (); k++)
4744 gimple *next_phi = new_phis[k];
4745 tree second_vect = PHI_RESULT (next_phi);
4746 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4747 new_vec_stmt = gimple_build_assign (tem, code,
4748 first_vect, second_vect);
4749 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4750 first_vect = tem;
4753 new_phi_result = first_vect;
4754 if (new_vec_stmt)
4756 new_phis.truncate (0);
4757 new_phis.safe_push (new_vec_stmt);
4760 /* Likewise if we couldn't use a single def-use cycle. */
4761 else if (ncopies > 1)
4763 gcc_assert (new_phis.length () == 1);
4764 tree first_vect = PHI_RESULT (new_phis[0]);
4765 gassign *new_vec_stmt = NULL;
4766 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4767 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4768 for (int k = 1; k < ncopies; ++k)
4770 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4771 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4772 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4773 new_vec_stmt = gimple_build_assign (tem, code,
4774 first_vect, second_vect);
4775 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4776 first_vect = tem;
4778 new_phi_result = first_vect;
4779 new_phis.truncate (0);
4780 new_phis.safe_push (new_vec_stmt);
4782 else
4783 new_phi_result = PHI_RESULT (new_phis[0]);
4785 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4786 && reduc_fn != IFN_LAST)
4788 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4789 various data values where the condition matched and another vector
4790 (INDUCTION_INDEX) containing all the indexes of those matches. We
4791 need to extract the last matching index (which will be the index with
4792 highest value) and use this to index into the data vector.
4793 For the case where there were no matches, the data vector will contain
4794 all default values and the index vector will be all zeros. */
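/* For example, continuing the earlier example, if INDUCTION_INDEX is
   {0, 2, 7, 4} the max index is 7; comparing {7, 7, 7, 7} against the
   index vector selects only lane 2, so the VEC_COND below keeps that
   lane of the data vector (zero elsewhere) and the final max reduction
   extracts that lane's data value.  */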
4796 /* Get various versions of the type of the vector of indexes. */
4797 tree index_vec_type = TREE_TYPE (induction_index);
4798 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4799 tree index_scalar_type = TREE_TYPE (index_vec_type);
4800 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4801 (index_vec_type);
4803 /* Get an unsigned integer version of the type of the data vector. */
4804 int scalar_precision
4805 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4806 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4807 tree vectype_unsigned = build_vector_type
4808 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4810 /* First we need to create a vector (ZERO_VEC) of zeros and another
4811 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4812 can create using a MAX reduction and then expanding.
4813 In the case where the loop never made any matches, the max index will
4814 be zero. */
4816 /* Vector of {0, 0, 0,...}. */
4817 tree zero_vec = make_ssa_name (vectype);
4818 tree zero_vec_rhs = build_zero_cst (vectype);
4819 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4820 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4822 /* Find maximum value from the vector of found indexes. */
4823 tree max_index = make_ssa_name (index_scalar_type);
4824 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4825 1, induction_index);
4826 gimple_call_set_lhs (max_index_stmt, max_index);
4827 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4829 /* Vector of {max_index, max_index, max_index,...}. */
4830 tree max_index_vec = make_ssa_name (index_vec_type);
4831 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4832 max_index);
4833 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4834 max_index_vec_rhs);
4835 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4837 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4838 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4839 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4840 otherwise. Only one value should match, resulting in a vector
4841 (VEC_COND) with one data value and the rest zeros.
4842 In the case where the loop never made any matches, every index will
4843 match, resulting in a vector with all data values (which will all be
4844 the default value). */
4846 /* Compare the max index vector to the vector of found indexes to find
4847 the position of the max value. */
4848 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4849 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4850 induction_index,
4851 max_index_vec);
4852 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4854 /* Use the compare to choose either values from the data vector or
4855 zero. */
4856 tree vec_cond = make_ssa_name (vectype);
4857 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4858 vec_compare, new_phi_result,
4859 zero_vec);
4860 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4862 /* Finally we need to extract the data value from the vector (VEC_COND)
4863 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4864 reduction, but because this doesn't exist, we can use a MAX reduction
4865 instead. The data value might be signed or a float so we need to cast
4866 it first.
4867 In the case where the loop never made any matches, the data values are
4868 all identical, and so will reduce down correctly. */
4870 /* Make the matched data values unsigned. */
4871 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4872 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4873 vec_cond);
4874 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4875 VIEW_CONVERT_EXPR,
4876 vec_cond_cast_rhs);
4877 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4879 /* Reduce down to a scalar value. */
4880 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4881 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4882 1, vec_cond_cast);
4883 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4884 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4886 /* Convert the reduced value back to the result type and set as the
4887 result. */
4888 gimple_seq stmts = NULL;
4889 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4890 data_reduc);
4891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4892 scalar_results.safe_push (new_temp);
4894 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4895 && reduc_fn == IFN_LAST)
4897 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4898 idx = 0;
4899 idx_val = induction_index[0];
4900 val = data_reduc[0];
4901 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4902 if (induction_index[i] > idx_val)
4903 val = data_reduc[i], idx_val = induction_index[i];
4904 return val; */
4906 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4907 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4908 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4909 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4910 /* Enforced by vectorizable_reduction, which ensures we have target
4911 support before allowing a conditional reduction on variable-length
4912 vectors. */
4913 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4914 tree idx_val = NULL_TREE, val = NULL_TREE;
4915 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4917 tree old_idx_val = idx_val;
4918 tree old_val = val;
4919 idx_val = make_ssa_name (idx_eltype);
4920 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4921 build3 (BIT_FIELD_REF, idx_eltype,
4922 induction_index,
4923 bitsize_int (el_size),
4924 bitsize_int (off)));
4925 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4926 val = make_ssa_name (data_eltype);
4927 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4928 build3 (BIT_FIELD_REF,
4929 data_eltype,
4930 new_phi_result,
4931 bitsize_int (el_size),
4932 bitsize_int (off)));
4933 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4934 if (off != 0)
4936 tree new_idx_val = idx_val;
4937 tree new_val = val;
4938 if (off != v_size - el_size)
4940 new_idx_val = make_ssa_name (idx_eltype);
4941 epilog_stmt = gimple_build_assign (new_idx_val,
4942 MAX_EXPR, idx_val,
4943 old_idx_val);
4944 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4946 new_val = make_ssa_name (data_eltype);
4947 epilog_stmt = gimple_build_assign (new_val,
4948 COND_EXPR,
4949 build2 (GT_EXPR,
4950 boolean_type_node,
4951 idx_val,
4952 old_idx_val),
4953 val, old_val);
4954 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4955 idx_val = new_idx_val;
4956 val = new_val;
4959 /* Convert the reduced value back to the result type and set as the
4960 result. */
4961 gimple_seq stmts = NULL;
4962 val = gimple_convert (&stmts, scalar_type, val);
4963 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4964 scalar_results.safe_push (val);
4967 /* 2.3 Create the reduction code, using one of the three schemes described
4968 above. In SLP we simply need to extract all the elements from the
4969 vector (without reducing them), so we use scalar shifts. */
4970 else if (reduc_fn != IFN_LAST && !slp_reduc)
4972 tree tmp;
4973 tree vec_elem_type;
4975 /* Case 1: Create:
4976 v_out2 = reduc_expr <v_out1> */
4978 if (dump_enabled_p ())
4979 dump_printf_loc (MSG_NOTE, vect_location,
4980 "Reduce using direct vector reduction.\n");
4982 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4983 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4985 tree tmp_dest
4986 = vect_create_destination_var (scalar_dest, vec_elem_type);
4987 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4988 new_phi_result);
4989 gimple_set_lhs (epilog_stmt, tmp_dest);
4990 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4991 gimple_set_lhs (epilog_stmt, new_temp);
4992 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4994 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4995 new_temp);
4997 else
4999 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5000 new_phi_result);
5001 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5004 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5005 gimple_set_lhs (epilog_stmt, new_temp);
5006 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5008 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5009 == INTEGER_INDUC_COND_REDUCTION)
5010 && !operand_equal_p (initial_def, induc_val, 0))
5012 /* Earlier we set the initial value to be a vector of induc_val
5013 values. Check the result and if it is induc_val then replace
5014 with the original initial value, unless induc_val is
5015 the same as initial_def already. */
5016 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5017 induc_val);
5019 tmp = make_ssa_name (new_scalar_dest);
5020 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5021 initial_def, new_temp);
5022 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5023 new_temp = tmp;
5026 scalar_results.safe_push (new_temp);
5028 else if (direct_slp_reduc)
5030 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5031 with the elements for other SLP statements replaced with the
5032 neutral value. We can then do a normal reduction on each vector. */
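/* For example, with a group of two reductions a and b and a vector of
   partial results {a0, b0, a1, b1}, the masked index vector built below is
   {0, 1, 0, 1}; for i == 0 the comparison selects lanes 0 and 2, the
   VEC_COND_EXPR gives {a0, neutral, a1, neutral}, and the full-vector
   reduction of that yields the scalar result for a alone.  */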
5034 /* Enforced by vectorizable_reduction. */
5035 gcc_assert (new_phis.length () == 1);
5036 gcc_assert (pow2p_hwi (group_size));
5038 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5039 vec<stmt_vec_info> orig_phis
5040 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5041 gimple_seq seq = NULL;
5043 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5044 and the same element size as VECTYPE. */
5045 tree index = build_index_vector (vectype, 0, 1);
5046 tree index_type = TREE_TYPE (index);
5047 tree index_elt_type = TREE_TYPE (index_type);
5048 tree mask_type = build_same_sized_truth_vector_type (index_type);
5050 /* Create a vector that, for each element, identifies which of
5051 the REDUC_GROUP_SIZE results should use it. */
5052 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5053 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5054 build_vector_from_val (index_type, index_mask));
5056 /* Get a neutral vector value. This is simply a splat of the neutral
5057 scalar value if we have one, otherwise the initial scalar value
5058 is itself a neutral value. */
5059 tree vector_identity = NULL_TREE;
5060 if (neutral_op)
5061 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5062 neutral_op);
5063 for (unsigned int i = 0; i < group_size; ++i)
5065 /* If there's no universal neutral value, we can use the
5066 initial scalar value from the original PHI. This is used
5067 for MIN and MAX reduction, for example. */
5068 if (!neutral_op)
5070 tree scalar_value
5071 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5072 loop_preheader_edge (loop));
5073 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5074 scalar_value);
5077 /* Calculate the equivalent of:
5079 sel[j] = (index[j] == i);
5081 which selects the elements of NEW_PHI_RESULT that should
5082 be included in the result. */
5083 tree compare_val = build_int_cst (index_elt_type, i);
5084 compare_val = build_vector_from_val (index_type, compare_val);
5085 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5086 index, compare_val);
5088 /* Calculate the equivalent of:
5090 vec = sel ? new_phi_result : vector_identity;
5092 VEC is now suitable for a full vector reduction. */
5093 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5094 sel, new_phi_result, vector_identity);
5096 /* Do the reduction and convert it to the appropriate type. */
5097 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5098 TREE_TYPE (vectype), vec);
5099 scalar = gimple_convert (&seq, scalar_type, scalar);
5100 scalar_results.safe_push (scalar);
5102 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5104 else
5106 bool reduce_with_shift;
5107 tree vec_temp;
5109 /* COND reductions all do the final reduction with MAX_EXPR
5110 or MIN_EXPR. */
5111 if (code == COND_EXPR)
5113 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5114 == INTEGER_INDUC_COND_REDUCTION)
5115 code = induc_code;
5116 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5117 == CONST_COND_REDUCTION)
5118 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5119 else
5120 code = MAX_EXPR;
5123 /* See if the target wants to do the final (shift) reduction
5124 in a vector mode of smaller size and first reduce upper/lower
5125 halves against each other. */
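/* For example, if the target's split_reduction hook prefers V2DI for a
   V4DI reduction, the loop below extracts the low and high V2DI halves,
   combines them with CODE, and only then falls through to the shift- or
   extract-based reduction on the narrower vector.  */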
5126 enum machine_mode mode1 = mode;
5127 tree vectype1 = vectype;
5128 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5129 unsigned sz1 = sz;
5130 if (!slp_reduc
5131 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5132 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5134 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5135 reduce_with_shift = have_whole_vector_shift (mode1);
5136 if (!VECTOR_MODE_P (mode1))
5137 reduce_with_shift = false;
5138 else
5140 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5141 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5142 reduce_with_shift = false;
5145 /* First reduce the vector to the vector size we should do the shift
5146 reduction on, by combining the upper and lower halves. */
5147 new_temp = new_phi_result;
5148 while (sz > sz1)
5150 gcc_assert (!slp_reduc);
5151 sz /= 2;
5152 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5154 /* The target has to make sure we support lowpart/highpart
5155 extraction, either via direct vector extract or through
5156 integer mode punning. */
5157 tree dst1, dst2;
5158 if (convert_optab_handler (vec_extract_optab,
5159 TYPE_MODE (TREE_TYPE (new_temp)),
5160 TYPE_MODE (vectype1))
5161 != CODE_FOR_nothing)
5163 /* Extract sub-vectors directly once vec_extract becomes
5164 a conversion optab. */
5165 dst1 = make_ssa_name (vectype1);
5166 epilog_stmt
5167 = gimple_build_assign (dst1, BIT_FIELD_REF,
5168 build3 (BIT_FIELD_REF, vectype1,
5169 new_temp, TYPE_SIZE (vectype1),
5170 bitsize_int (0)));
5171 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5172 dst2 = make_ssa_name (vectype1);
5173 epilog_stmt
5174 = gimple_build_assign (dst2, BIT_FIELD_REF,
5175 build3 (BIT_FIELD_REF, vectype1,
5176 new_temp, TYPE_SIZE (vectype1),
5177 bitsize_int (sz * BITS_PER_UNIT)));
5178 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5180 else
5182 /* Extract via punning to an appropriately sized integer mode
5183 vector. */
5184 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5186 tree etype = build_vector_type (eltype, 2);
5187 gcc_assert (convert_optab_handler (vec_extract_optab,
5188 TYPE_MODE (etype),
5189 TYPE_MODE (eltype))
5190 != CODE_FOR_nothing);
5191 tree tem = make_ssa_name (etype);
5192 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5193 build1 (VIEW_CONVERT_EXPR,
5194 etype, new_temp));
5195 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5196 new_temp = tem;
5197 tem = make_ssa_name (eltype);
5198 epilog_stmt
5199 = gimple_build_assign (tem, BIT_FIELD_REF,
5200 build3 (BIT_FIELD_REF, eltype,
5201 new_temp, TYPE_SIZE (eltype),
5202 bitsize_int (0)));
5203 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5204 dst1 = make_ssa_name (vectype1);
5205 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5206 build1 (VIEW_CONVERT_EXPR,
5207 vectype1, tem));
5208 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5209 tem = make_ssa_name (eltype);
5210 epilog_stmt
5211 = gimple_build_assign (tem, BIT_FIELD_REF,
5212 build3 (BIT_FIELD_REF, eltype,
5213 new_temp, TYPE_SIZE (eltype),
5214 bitsize_int (sz * BITS_PER_UNIT)));
5215 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5216 dst2 = make_ssa_name (vectype1);
5217 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5218 build1 (VIEW_CONVERT_EXPR,
5219 vectype1, tem));
5220 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5223 new_temp = make_ssa_name (vectype1);
5224 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5225 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5228 if (reduce_with_shift && !slp_reduc)
5230 int element_bitsize = tree_to_uhwi (bitsize);
5231 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5232 for variable-length vectors and also requires direct target support
5233 for loop reductions. */
5234 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5235 int nelements = vec_size_in_bits / element_bitsize;
5236 vec_perm_builder sel;
5237 vec_perm_indices indices;
5239 int elt_offset;
5241 tree zero_vec = build_zero_cst (vectype1);
5242 /* Case 2: Create:
5243 for (offset = nelements/2; offset >= 1; offset/=2)
5245 Create: va' = vec_shift <va, offset>
5246 Create: va = vop <va, va'>
5247 } */
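/* As an illustration (the IL is not generated literally like this), for a
   four-element vector VA = {a, b, c, d} and a PLUS reduction this builds:
     t = vec_shift <VA, 2>     -> {c, d, 0, 0}
     VA = VA + t               -> {a+c, b+d, c, d}
     t = vec_shift <VA, 1>     -> {b+d, c, d, 0}
     VA = VA + t               -> {a+b+c+d, ...}
   and the scalar result is afterwards extracted from element 0.  */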
5249 tree rhs;
5251 if (dump_enabled_p ())
5252 dump_printf_loc (MSG_NOTE, vect_location,
5253 "Reduce using vector shifts\n");
5255 mode1 = TYPE_MODE (vectype1);
5256 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5257 for (elt_offset = nelements / 2;
5258 elt_offset >= 1;
5259 elt_offset /= 2)
5261 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5262 indices.new_vector (sel, 2, nelements);
5263 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5264 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5265 new_temp, zero_vec, mask);
5266 new_name = make_ssa_name (vec_dest, epilog_stmt);
5267 gimple_assign_set_lhs (epilog_stmt, new_name);
5268 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5270 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5271 new_temp);
5272 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5273 gimple_assign_set_lhs (epilog_stmt, new_temp);
5274 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5277 /* 2.4 Extract the final scalar result. Create:
5278 s_out3 = extract_field <v_out2, bitpos> */
5280 if (dump_enabled_p ())
5281 dump_printf_loc (MSG_NOTE, vect_location,
5282 "extract scalar result\n");
5284 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5285 bitsize, bitsize_zero_node);
5286 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5287 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5288 gimple_assign_set_lhs (epilog_stmt, new_temp);
5289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5290 scalar_results.safe_push (new_temp);
5292 else
5294 /* Case 3: Create:
5295 s = extract_field <v_out2, 0>
5296 for (offset = element_size;
5297 offset < vector_size;
5298 offset += element_size;)
5300 Create: s' = extract_field <v_out2, offset>
5301 Create: s = op <s, s'> // For non SLP cases
5302 } */
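/* For instance, for a single result vector V = {a, b, c, d} and a non-SLP
   PLUS reduction this emits roughly:
     s = BIT_FIELD_REF <V, elt_size, 0>
     s = s + BIT_FIELD_REF <V, elt_size, 1*elt_size>
     s = s + BIT_FIELD_REF <V, elt_size, 2*elt_size>
     s = s + BIT_FIELD_REF <V, elt_size, 3*elt_size>
   whereas for SLP the extracted elements are only collected in
   SCALAR_RESULTS and not combined here.  */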
5304 if (dump_enabled_p ())
5305 dump_printf_loc (MSG_NOTE, vect_location,
5306 "Reduce using scalar code.\n");
5308 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5309 int element_bitsize = tree_to_uhwi (bitsize);
5310 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5312 int bit_offset;
5313 if (gimple_code (new_phi) == GIMPLE_PHI)
5314 vec_temp = PHI_RESULT (new_phi);
5315 else
5316 vec_temp = gimple_assign_lhs (new_phi);
5317 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5318 bitsize_zero_node);
5319 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5320 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5321 gimple_assign_set_lhs (epilog_stmt, new_temp);
5322 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5324 /* In SLP we don't need to apply reduction operation, so we just
5325 collect s' values in SCALAR_RESULTS. */
5326 if (slp_reduc)
5327 scalar_results.safe_push (new_temp);
5329 for (bit_offset = element_bitsize;
5330 bit_offset < vec_size_in_bits;
5331 bit_offset += element_bitsize)
5333 tree bitpos = bitsize_int (bit_offset);
5334 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5335 bitsize, bitpos);
5337 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5338 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5339 gimple_assign_set_lhs (epilog_stmt, new_name);
5340 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5342 if (slp_reduc)
5344 /* In SLP we don't need to apply reduction operation, so
5345 we just collect s' values in SCALAR_RESULTS. */
5346 new_temp = new_name;
5347 scalar_results.safe_push (new_name);
5349 else
5351 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5352 new_name, new_temp);
5353 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5354 gimple_assign_set_lhs (epilog_stmt, new_temp);
5355 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 /* The only case where we need to reduce scalar results in SLP is
5361 unrolling. If the size of SCALAR_RESULTS is greater than
5362 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5363 REDUC_GROUP_SIZE. */
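/* For example, with REDUC_GROUP_SIZE == 2 and four collected results
   {s0, s1, s2, s3} (the SLP instance was unrolled twice) this leaves
   {s0 op s2, s1 op s3} as the two final scalar results.  */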
5364 if (slp_reduc)
5366 tree res, first_res, new_res;
5367 gimple *new_stmt;
5369 /* Reduce multiple scalar results in case of SLP unrolling. */
5370 for (j = group_size; scalar_results.iterate (j, &res);
5371 j++)
5373 first_res = scalar_results[j % group_size];
5374 new_stmt = gimple_build_assign (new_scalar_dest, code,
5375 first_res, res);
5376 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5377 gimple_assign_set_lhs (new_stmt, new_res);
5378 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5379 scalar_results[j % group_size] = new_res;
5382 else
5383 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5384 scalar_results.safe_push (new_temp);
5387 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5388 == INTEGER_INDUC_COND_REDUCTION)
5389 && !operand_equal_p (initial_def, induc_val, 0))
5391 /* Earlier we set the initial value to be a vector of induc_val
5392 values. Check the result and if it is induc_val then replace
5393 with the original initial value, unless induc_val is
5394 the same as initial_def already. */
5395 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5396 induc_val);
5398 tree tmp = make_ssa_name (new_scalar_dest);
5399 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5400 initial_def, new_temp);
5401 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5402 scalar_results[0] = tmp;
5406 vect_finalize_reduction:
5408 if (double_reduc)
5409 loop = loop->inner;
5411 /* 2.5 Adjust the final result by the initial value of the reduction
5412 variable. (When such adjustment is not needed, then
5413 'adjustment_def' is zero). For example, if code is PLUS we create:
5414 new_temp = loop_exit_def + adjustment_def */
5416 if (adjustment_def)
5418 gcc_assert (!slp_reduc);
5419 if (nested_in_vect_loop)
5421 new_phi = new_phis[0];
5422 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5423 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5424 new_dest = vect_create_destination_var (scalar_dest, vectype);
5426 else
5428 new_temp = scalar_results[0];
5429 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5430 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5431 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5434 epilog_stmt = gimple_build_assign (new_dest, expr);
5435 new_temp = make_ssa_name (new_dest, epilog_stmt);
5436 gimple_assign_set_lhs (epilog_stmt, new_temp);
5437 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5438 if (nested_in_vect_loop)
5440 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5441 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5442 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5444 if (!double_reduc)
5445 scalar_results.quick_push (new_temp);
5446 else
5447 scalar_results[0] = new_temp;
5449 else
5450 scalar_results[0] = new_temp;
5452 new_phis[0] = epilog_stmt;
5455 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5456 phis with new adjusted scalar results, i.e., replace use <s_out0>
5457 with use <s_out4>.
5459 Transform:
5460 loop_exit:
5461 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5462 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5463 v_out2 = reduce <v_out1>
5464 s_out3 = extract_field <v_out2, 0>
5465 s_out4 = adjust_result <s_out3>
5466 use <s_out0>
5467 use <s_out0>
5469 into:
5471 loop_exit:
5472 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5473 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5474 v_out2 = reduce <v_out1>
5475 s_out3 = extract_field <v_out2, 0>
5476 s_out4 = adjust_result <s_out3>
5477 use <s_out4>
5478 use <s_out4> */
5481 /* In an SLP reduction chain we reduce vector results into one vector if
5482 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5483 LHS of the last stmt in the reduction chain, since we are looking for
5484 the loop exit phi node. */
5485 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5487 stmt_vec_info dest_stmt_info
5488 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5489 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5490 group_size = 1;
5493 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5494 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5495 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5496 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5497 correspond to the first vector stmt, etc.
5498 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
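/* E.g. with REDUC_GROUP_SIZE == 4 and two new vector stmts RATIO is 2:
   scalar results 0 and 1 correspond to the first vector stmt and scalar
   results 2 and 3 to the second.  */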
5499 if (group_size > new_phis.length ())
5501 ratio = group_size / new_phis.length ();
5502 gcc_assert (!(group_size % new_phis.length ()));
5504 else
5505 ratio = 1;
5507 stmt_vec_info epilog_stmt_info = NULL;
5508 for (k = 0; k < group_size; k++)
5510 if (k % ratio == 0)
5512 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5513 reduction_phi_info = reduction_phis[k / ratio];
5514 if (double_reduc)
5515 inner_phi = inner_phis[k / ratio];
5518 if (slp_reduc)
5520 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5522 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5523 /* SLP statements can't participate in patterns. */
5524 gcc_assert (!orig_stmt_info);
5525 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5528 phis.create (3);
5529 /* Find the loop-closed-use at the loop exit of the original scalar
5530 result. (The reduction result is expected to have two immediate uses -
5531 one at the latch block, and one at the loop exit). */
5532 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5533 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5534 && !is_gimple_debug (USE_STMT (use_p)))
5535 phis.safe_push (USE_STMT (use_p));
5537 /* While we expect to have found an exit_phi because of loop-closed-ssa
5538 form we can end up without one if the scalar cycle is dead. */
5540 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5542 if (outer_loop)
5544 stmt_vec_info exit_phi_vinfo
5545 = loop_vinfo->lookup_stmt (exit_phi);
5546 gphi *vect_phi;
5548 /* FORNOW. Currently not supporting the case that an inner-loop
5549 reduction is not used in the outer-loop (but only outside the
5550 outer-loop), unless it is a double reduction. */
5551 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5552 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5553 || double_reduc);
5555 if (double_reduc)
5556 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5557 else
5558 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5559 if (!double_reduc
5560 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5561 != vect_double_reduction_def)
5562 continue;
5564 /* Handle double reduction:
5566 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5567 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5568 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5569 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5571 At that point the regular reduction (stmt2 and stmt3) is
5572 already vectorized, as well as the exit phi node, stmt4.
5573 Here we vectorize the phi node of double reduction, stmt1, and
5574 update all relevant statements. */
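/* A typical source form of such a double reduction is

     int s = 0;
     for (i = 0; i < n; i++)      <-- outer loop (stmt1, stmt4)
       for (j = 0; j < m; j++)    <-- inner loop (stmt2, stmt3)
         s += a[i][j];

   where the result of the inner-loop reduction is itself accumulated
   across the outer loop.  */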
5576 /* Go through all the uses of s2 to find double reduction phi
5577 node, i.e., stmt1 above. */
5578 orig_name = PHI_RESULT (exit_phi);
5579 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5581 stmt_vec_info use_stmt_vinfo;
5582 tree vect_phi_init, preheader_arg, vect_phi_res;
5583 basic_block bb = gimple_bb (use_stmt);
5585 /* Check that USE_STMT is really double reduction phi
5586 node. */
5587 if (gimple_code (use_stmt) != GIMPLE_PHI
5588 || gimple_phi_num_args (use_stmt) != 2
5589 || bb->loop_father != outer_loop)
5590 continue;
5591 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5592 if (!use_stmt_vinfo
5593 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5594 != vect_double_reduction_def)
5595 continue;
5597 /* Create vector phi node for double reduction:
5598 vs1 = phi <vs0, vs2>
5599 vs1 was created previously in this function by a call to
5600 vect_get_vec_def_for_operand and is stored in
5601 vec_initial_def;
5602 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5603 vs0 is created here. */
5605 /* Create vector phi node. */
5606 vect_phi = create_phi_node (vec_initial_def, bb);
5607 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5609 /* Create vs0 - initial def of the double reduction phi. */
5610 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5611 loop_preheader_edge (outer_loop));
5612 vect_phi_init = get_initial_def_for_reduction
5613 (stmt_info, preheader_arg, NULL);
5615 /* Update phi node arguments with vs0 and vs2. */
5616 add_phi_arg (vect_phi, vect_phi_init,
5617 loop_preheader_edge (outer_loop),
5618 UNKNOWN_LOCATION);
5619 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5620 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5621 if (dump_enabled_p ())
5622 dump_printf_loc (MSG_NOTE, vect_location,
5623 "created double reduction phi node: %G",
5624 vect_phi);
5626 vect_phi_res = PHI_RESULT (vect_phi);
5628 /* Replace the use, i.e., set the correct vs1 in the regular
5629 reduction phi node. FORNOW, NCOPIES is always 1, so the
5630 loop is redundant. */
5631 stmt_vec_info use_info = reduction_phi_info;
5632 for (j = 0; j < ncopies; j++)
5634 edge pr_edge = loop_preheader_edge (loop);
5635 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5636 pr_edge->dest_idx, vect_phi_res);
5637 use_info = STMT_VINFO_RELATED_STMT (use_info);
5643 phis.release ();
5644 if (nested_in_vect_loop)
5646 if (double_reduc)
5647 loop = outer_loop;
5648 else
5649 continue;
5652 phis.create (3);
5653 /* Find the loop-closed-use at the loop exit of the original scalar
5654 result. (The reduction result is expected to have two immediate uses,
5655 one at the latch block, and one at the loop exit). For double
5656 reductions we are looking for exit phis of the outer loop. */
5657 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5659 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5661 if (!is_gimple_debug (USE_STMT (use_p)))
5662 phis.safe_push (USE_STMT (use_p));
5664 else
5666 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5668 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5670 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5672 if (!flow_bb_inside_loop_p (loop,
5673 gimple_bb (USE_STMT (phi_use_p)))
5674 && !is_gimple_debug (USE_STMT (phi_use_p)))
5675 phis.safe_push (USE_STMT (phi_use_p));
5681 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5683 /* Replace the uses: */
5684 orig_name = PHI_RESULT (exit_phi);
5685 scalar_result = scalar_results[k];
5686 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5687 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5688 SET_USE (use_p, scalar_result);
5691 phis.release ();
5695 /* Return a vector of type VECTYPE that is equal to the vector select
5696 operation "MASK ? VEC : IDENTITY". Insert the select statements
5697 before GSI. */
5699 static tree
5700 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5701 tree vec, tree identity)
5703 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5704 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5705 mask, vec, identity);
5706 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5707 return cond;
5710 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5711 order, starting with LHS. Insert the extraction statements before GSI and
5712 associate the new scalar SSA names with variable SCALAR_DEST.
5713 Return the SSA name for the result. */
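/* For example, for LHS l0 and a four-element VECTOR_RHS this emits
   approximately:
     e0 = BIT_FIELD_REF <VECTOR_RHS, elt_size, 0>
     l1 = l0 CODE e0
     e1 = BIT_FIELD_REF <VECTOR_RHS, elt_size, 1*elt_size>
     l2 = l1 CODE e1
     ...
   and returns the SSA name of the last accumulator (l4 here).  */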
5715 static tree
5716 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5717 tree_code code, tree lhs, tree vector_rhs)
5719 tree vectype = TREE_TYPE (vector_rhs);
5720 tree scalar_type = TREE_TYPE (vectype);
5721 tree bitsize = TYPE_SIZE (scalar_type);
5722 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5723 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5725 for (unsigned HOST_WIDE_INT bit_offset = 0;
5726 bit_offset < vec_size_in_bits;
5727 bit_offset += element_bitsize)
5729 tree bitpos = bitsize_int (bit_offset);
5730 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5731 bitsize, bitpos);
5733 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5734 rhs = make_ssa_name (scalar_dest, stmt);
5735 gimple_assign_set_lhs (stmt, rhs);
5736 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5738 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5739 tree new_name = make_ssa_name (scalar_dest, stmt);
5740 gimple_assign_set_lhs (stmt, new_name);
5741 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5742 lhs = new_name;
5744 return lhs;
5747 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5748 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5749 statement. CODE is the operation performed by STMT_INFO and OPS are
5750 its scalar operands. REDUC_INDEX is the index of the operand in
5751 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5752 implements in-order reduction, or IFN_LAST if we should open-code it.
5753 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5754 that should be used to control the operation in a fully-masked loop. */
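/* An in-order reduction is used for source loops such as

     double res = init;
     for (int i = 0; i < n; i++)
       res += a[i];

   when reassociation is not allowed (e.g. FP addition without
   -ffast-math): each vector of loaded elements is folded into the scalar
   accumulator in the original order, either via REDUC_FN
   (e.g. IFN_FOLD_LEFT_PLUS) or by the open-coded expansion in
   vect_expand_fold_left.  */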
5756 static bool
5757 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5758 gimple_stmt_iterator *gsi,
5759 stmt_vec_info *vec_stmt, slp_tree slp_node,
5760 gimple *reduc_def_stmt,
5761 tree_code code, internal_fn reduc_fn,
5762 tree ops[3], tree vectype_in,
5763 int reduc_index, vec_loop_masks *masks)
5765 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5766 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5767 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5768 stmt_vec_info new_stmt_info = NULL;
5770 int ncopies;
5771 if (slp_node)
5772 ncopies = 1;
5773 else
5774 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5776 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5777 gcc_assert (ncopies == 1);
5778 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5779 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5780 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5781 == FOLD_LEFT_REDUCTION);
5783 if (slp_node)
5784 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5785 TYPE_VECTOR_SUBPARTS (vectype_in)));
5787 tree op0 = ops[1 - reduc_index];
5789 int group_size = 1;
5790 stmt_vec_info scalar_dest_def_info;
5791 auto_vec<tree> vec_oprnds0;
5792 if (slp_node)
5794 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5795 slp_node);
5796 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5797 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5799 else
5801 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5802 vec_oprnds0.create (1);
5803 vec_oprnds0.quick_push (loop_vec_def0);
5804 scalar_dest_def_info = stmt_info;
5807 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5808 tree scalar_type = TREE_TYPE (scalar_dest);
5809 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5811 int vec_num = vec_oprnds0.length ();
5812 gcc_assert (vec_num == 1 || slp_node);
5813 tree vec_elem_type = TREE_TYPE (vectype_out);
5814 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5816 tree vector_identity = NULL_TREE;
5817 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5818 vector_identity = build_zero_cst (vectype_out);
5820 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5821 int i;
5822 tree def0;
5823 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5825 gimple *new_stmt;
5826 tree mask = NULL_TREE;
5827 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5828 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5830 /* Handle MINUS by adding the negative. */
5831 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5833 tree negated = make_ssa_name (vectype_out);
5834 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5835 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5836 def0 = negated;
5839 if (mask)
5840 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5841 vector_identity);
5843 /* On the first iteration the input is simply the scalar phi
5844 result, and for subsequent iterations it is the output of
5845 the preceding operation. */
5846 if (reduc_fn != IFN_LAST)
5848 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5849 /* For chained SLP reductions the output of the previous reduction
5850 operation serves as the input of the next. For the final statement
5851 the output cannot be a temporary - we reuse the original
5852 scalar destination of the last statement. */
5853 if (i != vec_num - 1)
5855 gimple_set_lhs (new_stmt, scalar_dest_var);
5856 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5857 gimple_set_lhs (new_stmt, reduc_var);
5860 else
5862 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5863 reduc_var, def0);
5864 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5865 /* Remove the statement, so that we can use the same code paths
5866 as for statements that we've just created. */
5867 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5868 gsi_remove (&tmp_gsi, false);
5871 if (i == vec_num - 1)
5873 gimple_set_lhs (new_stmt, scalar_dest);
5874 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5875 new_stmt);
5877 else
5878 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5879 new_stmt, gsi);
5881 if (slp_node)
5882 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5885 if (!slp_node)
5886 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5888 return true;
5891 /* Function is_nonwrapping_integer_induction.
5893 Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5894 that both increments and does not cause overflow. */
5896 static bool
5897 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5899 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5900 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5901 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5902 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5903 widest_int ni, max_loop_value, lhs_max;
5904 wi::overflow_type overflow = wi::OVF_NONE;
5906 /* Make sure the loop is integer based. */
5907 if (TREE_CODE (base) != INTEGER_CST
5908 || TREE_CODE (step) != INTEGER_CST)
5909 return false;
5911 /* Check that the max size of the loop will not wrap. */
5913 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5914 return true;
5916 if (! max_stmt_executions (loop, &ni))
5917 return false;
5919 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5920 &overflow);
5921 if (overflow)
5922 return false;
5924 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5925 TYPE_SIGN (lhs_type), &overflow);
5926 if (overflow)
5927 return false;
5929 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5930 <= TYPE_PRECISION (lhs_type));
5933 /* Function vectorizable_reduction.
5935 Check if STMT_INFO performs a reduction operation that can be vectorized.
5936 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5937 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5938 Return true if STMT_INFO is vectorizable in this way.
5940 This function also handles reduction idioms (patterns) that have been
5941 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5942 may be of this form:
5943 X = pattern_expr (arg0, arg1, ..., X)
5944 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5945 sequence that had been detected and replaced by the pattern-stmt
5946 (STMT_INFO).
5948 This function also handles reduction of condition expressions, for example:
5949 for (int i = 0; i < N; i++)
5950 if (a[i] < value)
5951 last = a[i];
5952 This is handled by vectorising the loop and creating an additional vector
5953 containing the loop indexes for which "a[i] < value" was true. In the
5954 function epilogue this is reduced to a single max value and then used to
5955 index into the vector of results.
5957 In some cases of reduction patterns, the type of the reduction variable X is
5958 different than the type of the other arguments of STMT_INFO.
5959 In such cases, the vectype that is used when transforming STMT_INFO into
5960 a vector stmt is different than the vectype that is used to determine the
5961 vectorization factor, because it consists of a different number of elements
5962 than the actual number of elements that are being operated upon in parallel.
5964 For example, consider an accumulation of shorts into an int accumulator.
5965 On some targets it's possible to vectorize this pattern operating on 8
5966 shorts at a time (hence, the vectype for purposes of determining the
5967 vectorization factor should be V8HI); on the other hand, the vectype that
5968 is used to create the vector form is actually V4SI (the type of the result).
5970 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5971 indicates what is the actual level of parallelism (V8HI in the example), so
5972 that the right vectorization factor would be derived. This vectype
5973 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5974 be used to create the vectorized stmt. The right vectype for the vectorized
5975 stmt is obtained from the type of the result X:
5976 get_vectype_for_scalar_type (TREE_TYPE (X))
5978 This means that, contrary to "regular" reductions (or "regular" stmts in
5979 general), the following equation:
5980 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5981 does *NOT* necessarily hold for reduction patterns. */
5983 bool
5984 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5985 stmt_vec_info *vec_stmt, slp_tree slp_node,
5986 slp_instance slp_node_instance,
5987 stmt_vector_for_cost *cost_vec)
5989 tree vec_dest;
5990 tree scalar_dest;
5991 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5992 tree vectype_in = NULL_TREE;
5993 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5994 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5995 enum tree_code code, orig_code;
5996 internal_fn reduc_fn;
5997 machine_mode vec_mode;
5998 int op_type;
5999 optab optab;
6000 tree new_temp = NULL_TREE;
6001 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6002 stmt_vec_info cond_stmt_vinfo = NULL;
6003 enum tree_code cond_reduc_op_code = ERROR_MARK;
6004 tree scalar_type;
6005 bool is_simple_use;
6006 int i;
6007 int ncopies;
6008 int epilog_copies;
6009 stmt_vec_info prev_stmt_info, prev_phi_info;
6010 bool single_defuse_cycle = false;
6011 stmt_vec_info new_stmt_info = NULL;
6012 int j;
6013 tree ops[3];
6014 enum vect_def_type dts[3];
6015 bool nested_cycle = false, found_nested_cycle_def = false;
6016 bool double_reduc = false;
6017 basic_block def_bb;
6018 struct loop * def_stmt_loop;
6019 tree def_arg;
6020 auto_vec<tree> vec_oprnds0;
6021 auto_vec<tree> vec_oprnds1;
6022 auto_vec<tree> vec_oprnds2;
6023 auto_vec<tree> vect_defs;
6024 auto_vec<stmt_vec_info> phis;
6025 int vec_num;
6026 tree def0, tem;
6027 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6028 tree cond_reduc_val = NULL_TREE;
6030 /* Make sure it was already recognized as a reduction computation. */
6031 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6033 return false;
6035 if (nested_in_vect_loop_p (loop, stmt_info))
6037 loop = loop->inner;
6038 nested_cycle = true;
6041 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6042 gcc_assert (slp_node
6043 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6045 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6047 tree phi_result = gimple_phi_result (phi);
6048 /* Analysis is fully done on the reduction stmt invocation. */
6049 if (! vec_stmt)
6051 if (slp_node)
6052 slp_node_instance->reduc_phis = slp_node;
6054 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6055 return true;
6058 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6059 /* Leave the scalar phi in place. Note that checking
6060 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6061 for reductions involving a single statement. */
6062 return true;
6064 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6065 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6067 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6068 == EXTRACT_LAST_REDUCTION)
6069 /* Leave the scalar phi in place. */
6070 return true;
6072 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6073 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6075 tree op = gimple_op (reduc_stmt, k);
6076 if (op == phi_result)
6077 continue;
6078 if (k == 1
6079 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6080 continue;
6081 if (!vectype_in
6082 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6083 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6084 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6085 break;
6087 /* For a nested cycle we might end up with an operation like
6088 phi_result * phi_result. */
6089 if (!vectype_in)
6090 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6091 gcc_assert (vectype_in);
6093 if (slp_node)
6094 ncopies = 1;
6095 else
6096 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6098 stmt_vec_info use_stmt_info;
6099 if (ncopies > 1
6100 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6101 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6102 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6103 single_defuse_cycle = true;
6105 /* Create the destination vector */
6106 scalar_dest = gimple_assign_lhs (reduc_stmt);
6107 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6109 if (slp_node)
6110 /* The size vect_schedule_slp_instance computes is off for us. */
6111 vec_num = vect_get_num_vectors
6112 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6113 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6114 vectype_in);
6115 else
6116 vec_num = 1;
6118 /* Generate the reduction PHIs upfront. */
6119 prev_phi_info = NULL;
6120 for (j = 0; j < ncopies; j++)
6122 if (j == 0 || !single_defuse_cycle)
6124 for (i = 0; i < vec_num; i++)
6126 /* Create the reduction-phi that defines the reduction
6127 operand. */
6128 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6129 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6131 if (slp_node)
6132 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6133 else
6135 if (j == 0)
6136 STMT_VINFO_VEC_STMT (stmt_info)
6137 = *vec_stmt = new_phi_info;
6138 else
6139 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6140 prev_phi_info = new_phi_info;
6146 return true;
6149 /* 1. Is vectorizable reduction? */
6150 /* Not supportable if the reduction variable is used in the loop, unless
6151 it's a reduction chain. */
6152 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6153 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6154 return false;
6156 /* Reductions that are not used even in an enclosing outer-loop
6157 are expected to be "live" (used out of the loop). */
6158 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6159 && !STMT_VINFO_LIVE_P (stmt_info))
6160 return false;
6162 /* 2. Has this been recognized as a reduction pattern?
6164 Check if STMT represents a pattern that has been recognized
6165 in earlier analysis stages. For stmts that represent a pattern,
6166 the STMT_VINFO_RELATED_STMT field records the last stmt in
6167 the original sequence that constitutes the pattern. */
6169 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6170 if (orig_stmt_info)
6172 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6173 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6176 /* 3. Check the operands of the operation. The first operands are defined
6177 inside the loop body. The last operand is the reduction variable,
6178 which is defined by the loop-header-phi. */
6180 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6182 /* Flatten RHS. */
6183 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6185 case GIMPLE_BINARY_RHS:
6186 code = gimple_assign_rhs_code (stmt);
6187 op_type = TREE_CODE_LENGTH (code);
6188 gcc_assert (op_type == binary_op);
6189 ops[0] = gimple_assign_rhs1 (stmt);
6190 ops[1] = gimple_assign_rhs2 (stmt);
6191 break;
6193 case GIMPLE_TERNARY_RHS:
6194 code = gimple_assign_rhs_code (stmt);
6195 op_type = TREE_CODE_LENGTH (code);
6196 gcc_assert (op_type == ternary_op);
6197 ops[0] = gimple_assign_rhs1 (stmt);
6198 ops[1] = gimple_assign_rhs2 (stmt);
6199 ops[2] = gimple_assign_rhs3 (stmt);
6200 break;
6202 case GIMPLE_UNARY_RHS:
6203 return false;
6205 default:
6206 gcc_unreachable ();
6209 if (code == COND_EXPR && slp_node)
6210 return false;
6212 scalar_dest = gimple_assign_lhs (stmt);
6213 scalar_type = TREE_TYPE (scalar_dest);
6214 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6215 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6216 return false;
6218 /* Do not try to vectorize bit-precision reductions. */
6219 if (!type_has_mode_precision_p (scalar_type))
6220 return false;
6222 /* All uses but the last are expected to be defined in the loop.
6223 The last use is the reduction variable. In case of nested cycle this
6224 assumption is not true: we use reduc_index to record the index of the
6225 reduction variable. */
6226 stmt_vec_info reduc_def_info = NULL;
6227 int reduc_index = -1;
6228 for (i = 0; i < op_type; i++)
6230 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6231 if (i == 0 && code == COND_EXPR)
6232 continue;
6234 stmt_vec_info def_stmt_info;
6235 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6236 &def_stmt_info);
6237 dt = dts[i];
6238 gcc_assert (is_simple_use);
6239 if (dt == vect_reduction_def)
6241 reduc_def_info = def_stmt_info;
6242 reduc_index = i;
6243 continue;
6245 else if (tem)
6247 /* To properly compute ncopies we are interested in the widest
6248 input type in case we're looking at a widening accumulation. */
6249 if (!vectype_in
6250 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6251 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6252 vectype_in = tem;
6255 if (dt != vect_internal_def
6256 && dt != vect_external_def
6257 && dt != vect_constant_def
6258 && dt != vect_induction_def
6259 && !(dt == vect_nested_cycle && nested_cycle))
6260 return false;
6262 if (dt == vect_nested_cycle)
6264 found_nested_cycle_def = true;
6265 reduc_def_info = def_stmt_info;
6266 reduc_index = i;
6269 if (i == 1 && code == COND_EXPR)
6271 /* Record how value of COND_EXPR is defined. */
6272 if (dt == vect_constant_def)
6274 cond_reduc_dt = dt;
6275 cond_reduc_val = ops[i];
6277 if (dt == vect_induction_def
6278 && def_stmt_info
6279 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6281 cond_reduc_dt = dt;
6282 cond_stmt_vinfo = def_stmt_info;
6287 if (!vectype_in)
6288 vectype_in = vectype_out;
6290 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6291 directly used in stmt. */
6292 if (reduc_index == -1)
6294 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6296 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6298 "in-order reduction chain without SLP.\n");
6299 return false;
6302 if (orig_stmt_info)
6303 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6304 else
6305 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6308 if (! reduc_def_info)
6309 return false;
6311 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6312 if (!reduc_def_phi)
6313 return false;
6315 if (!(reduc_index == -1
6316 || dts[reduc_index] == vect_reduction_def
6317 || dts[reduc_index] == vect_nested_cycle
6318 || ((dts[reduc_index] == vect_internal_def
6319 || dts[reduc_index] == vect_external_def
6320 || dts[reduc_index] == vect_constant_def
6321 || dts[reduc_index] == vect_induction_def)
6322 && nested_cycle && found_nested_cycle_def)))
6324 /* For pattern recognized stmts, orig_stmt might be a reduction,
6325 but some helper statements for the pattern might not, or
6326 might be COND_EXPRs with reduction uses in the condition. */
6327 gcc_assert (orig_stmt_info);
6328 return false;
6331 /* PHIs should not participate in patterns. */
6332 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6333 enum vect_reduction_type v_reduc_type
6334 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6335 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6337 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6338 /* If we have a condition reduction, see if we can simplify it further. */
6339 if (v_reduc_type == COND_REDUCTION)
6341 /* TODO: We can't yet handle reduction chains, since we need to treat
6342 each COND_EXPR in the chain specially, not just the last one.
6343 E.g. for:
6345 x_1 = PHI <x_3, ...>
6346 x_2 = a_2 ? ... : x_1;
6347 x_3 = a_3 ? ... : x_2;
6349 we're interested in the last element in x_3 for which a_2 || a_3
6350 is true, whereas the current reduction chain handling would
6351 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6352 as a reduction operation. */
6353 if (reduc_index == -1)
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "conditional reduction chains not supported\n");
6358 return false;
6361 /* vect_is_simple_reduction ensured that operand 2 is the
6362 loop-carried operand. */
6363 gcc_assert (reduc_index == 2);
6365 /* Loop peeling modifies the initial value of the reduction PHI, which
6366 makes the reduction stmt to be transformed differ from the
6367 original stmt analyzed. We need to record the reduction code for
6368 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6369 it can be used directly at transform stage. */
6370 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6371 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6373 /* Also set the reduction type to CONST_COND_REDUCTION. */
6374 gcc_assert (cond_reduc_dt == vect_constant_def);
6375 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6377 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6378 vectype_in, OPTIMIZE_FOR_SPEED))
6380 if (dump_enabled_p ())
6381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6382 "optimizing condition reduction with"
6383 " FOLD_EXTRACT_LAST.\n");
6384 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6386 else if (cond_reduc_dt == vect_induction_def)
6388 tree base
6389 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6390 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6392 gcc_assert (TREE_CODE (base) == INTEGER_CST
6393 && TREE_CODE (step) == INTEGER_CST);
6394 cond_reduc_val = NULL_TREE;
6395 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6396 above base; punt if base is the minimum value of the type for
6397 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6398 if (tree_int_cst_sgn (step) == -1)
6400 cond_reduc_op_code = MIN_EXPR;
6401 if (tree_int_cst_sgn (base) == -1)
6402 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6403 else if (tree_int_cst_lt (base,
6404 TYPE_MAX_VALUE (TREE_TYPE (base))))
6405 cond_reduc_val
6406 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6408 else
6410 cond_reduc_op_code = MAX_EXPR;
6411 if (tree_int_cst_sgn (base) == 1)
6412 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6413 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6414 base))
6415 cond_reduc_val
6416 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6418 if (cond_reduc_val)
6420 if (dump_enabled_p ())
6421 dump_printf_loc (MSG_NOTE, vect_location,
6422 "condition expression based on "
6423 "integer induction.\n");
6424 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6425 = INTEGER_INDUC_COND_REDUCTION;
6428 else if (cond_reduc_dt == vect_constant_def)
6430 enum vect_def_type cond_initial_dt;
6431 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6432 tree cond_initial_val
6433 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6435 gcc_assert (cond_reduc_val != NULL_TREE);
6436 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6437 if (cond_initial_dt == vect_constant_def
6438 && types_compatible_p (TREE_TYPE (cond_initial_val),
6439 TREE_TYPE (cond_reduc_val)))
6441 tree e = fold_binary (LE_EXPR, boolean_type_node,
6442 cond_initial_val, cond_reduc_val);
6443 if (e && (integer_onep (e) || integer_zerop (e)))
6445 if (dump_enabled_p ())
6446 dump_printf_loc (MSG_NOTE, vect_location,
6447 "condition expression based on "
6448 "compile time constant.\n");
6449 /* Record reduction code at analysis stage. */
6450 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6451 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6452 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6453 = CONST_COND_REDUCTION;
6459 if (orig_stmt_info)
6460 gcc_assert (tmp == orig_stmt_info
6461 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6462 else
6463 /* We changed STMT to be the first stmt in the reduction chain, hence we
6464 check that in this case the first element in the chain is STMT. */
6465 gcc_assert (tmp == stmt_info
6466 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6468 if (STMT_VINFO_LIVE_P (reduc_def_info))
6469 return false;
6471 if (slp_node)
6472 ncopies = 1;
6473 else
6474 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6476 gcc_assert (ncopies >= 1);
6478 vec_mode = TYPE_MODE (vectype_in);
6479 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6481 if (nested_cycle)
6483 def_bb = gimple_bb (reduc_def_phi);
6484 def_stmt_loop = def_bb->loop_father;
6485 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6486 loop_preheader_edge (def_stmt_loop));
6487 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6488 if (def_arg_stmt_info
6489 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6490 == vect_double_reduction_def))
6491 double_reduc = true;
6494 if (code == COND_EXPR)
6496 /* Only call during the analysis stage, otherwise we'll lose
6497 STMT_VINFO_TYPE. We'll pass ops[0] as reduc_op, it's only
6498 used as a flag during analysis. */
6499 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6500 ops[0], 0, NULL,
6501 cost_vec))
6503 if (dump_enabled_p ())
6504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6505 "unsupported condition in reduction\n");
6506 return false;
6509 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6510 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6512 /* Only call during the analysis stage, otherwise we'll lose
6513 STMT_VINFO_TYPE. We only support this for nested cycles
6514 without double reductions at the moment. */
6515 if (!nested_cycle
6516 || double_reduc
6517 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6518 NULL, cost_vec)))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "unsupported shift or rotation in reduction\n");
6523 return false;
6526 else
6528 /* 4. Supportable by target? */
6530 /* 4.1. check support for the operation in the loop */
6531 optab = optab_for_tree_code (code, vectype_in, optab_default);
6532 if (!optab)
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "no optab.\n");
6538 return false;
6541 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6543 if (dump_enabled_p ())
6544 dump_printf (MSG_NOTE, "op not supported by target.\n");
6546 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6547 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6548 return false;
6550 if (dump_enabled_p ())
6551 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6554 /* Worthwhile without SIMD support? */
6555 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6556 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6560 "not worthwhile without SIMD support.\n");
6562 return false;
6566 /* 4.2. Check support for the epilog operation.
6568 If STMT represents a reduction pattern, then the type of the
6569 reduction variable may be different than the type of the rest
6570 of the arguments. For example, consider the case of accumulation
6571 of shorts into an int accumulator; the original code:
6572 S1: int_a = (int) short_a;
6573 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6575 was replaced with:
6576 STMT: int_acc = widen_sum <short_a, int_acc>
6578 This means that:
6579 1. The tree-code that is used to create the vector operation in the
6580 epilog code (that reduces the partial results) is not the
6581 tree-code of STMT, but is rather the tree-code of the original
6582 stmt from the pattern that STMT is replacing. I.e, in the example
6583 above we want to use 'widen_sum' in the loop, but 'plus' in the
6584 epilog.
6585 2. The type (mode) we use to check available target support
6586 for the vector operation to be created in the *epilog*, is
6587 determined by the type of the reduction variable (in the example
6588 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6589 However the type (mode) we use to check available target support
6590 for the vector operation to be created *inside the loop*, is
6591 determined by the type of the other arguments to STMT (in the
6592 example we'd check this: optab_handler (widen_sum_optab,
6593 vect_short_mode)).
6595 This is contrary to "regular" reductions, in which the types of all
6596 the arguments are the same as the type of the reduction variable.
6597 For "regular" reductions we can therefore use the same vector type
6598 (and also the same tree-code) when generating the epilog code and
6599 when generating the code inside the loop. */
6601 vect_reduction_type reduction_type
6602 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6603 if (orig_stmt_info
6604 && (reduction_type == TREE_CODE_REDUCTION
6605 || reduction_type == FOLD_LEFT_REDUCTION))
6607 /* This is a reduction pattern: get the vectype from the type of the
6608 reduction variable, and get the tree-code from orig_stmt. */
6609 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6610 gcc_assert (vectype_out);
6611 vec_mode = TYPE_MODE (vectype_out);
6613 else
6615 /* Regular reduction: the same vectype and tree-code as used for
6616 the vector code inside the loop can also be used for the epilog code. */
6617 orig_code = code;
6619 if (code == MINUS_EXPR)
6620 orig_code = PLUS_EXPR;
6622 /* For simple condition reductions, replace with the actual expression
6623 we want to base our reduction around. */
6624 if (reduction_type == CONST_COND_REDUCTION)
6626 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6627 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6629 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6630 orig_code = cond_reduc_op_code;
6633 reduc_fn = IFN_LAST;
6635 if (reduction_type == TREE_CODE_REDUCTION
6636 || reduction_type == FOLD_LEFT_REDUCTION
6637 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6638 || reduction_type == CONST_COND_REDUCTION)
6640 if (reduction_type == FOLD_LEFT_REDUCTION
6641 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6642 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6644 if (reduc_fn != IFN_LAST
6645 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6646 OPTIMIZE_FOR_SPEED))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6650 "reduc op not supported by target.\n");
6652 reduc_fn = IFN_LAST;
6655 else
6657 if (!nested_cycle || double_reduc)
6659 if (dump_enabled_p ())
6660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6661 "no reduc code for scalar code.\n");
6663 return false;
6667 else if (reduction_type == COND_REDUCTION)
6669 int scalar_precision
6670 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6671 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6672 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6673 nunits_out);
6675 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6676 OPTIMIZE_FOR_SPEED))
6677 reduc_fn = IFN_REDUC_MAX;
6680 if (reduction_type != EXTRACT_LAST_REDUCTION
6681 && (!nested_cycle || double_reduc)
6682 && reduc_fn == IFN_LAST
6683 && !nunits_out.is_constant ())
6685 if (dump_enabled_p ())
6686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6687 "missing target support for reduction on"
6688 " variable-length vectors.\n");
6689 return false;
6692 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6693 && ncopies > 1)
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6697 "multiple types in double reduction or condition "
6698 "reduction.\n");
6699 return false;
6702 /* For SLP reductions, see if there is a neutral value we can use. */
6703 tree neutral_op = NULL_TREE;
6704 if (slp_node)
6705 neutral_op = neutral_op_for_slp_reduction
6706 (slp_node_instance->reduc_phis, code,
6707 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6709 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6711 /* We can't support in-order reductions of code such as this:
6713 for (int i = 0; i < n1; ++i)
6714 for (int j = 0; j < n2; ++j)
6715 l += a[j];
6717 since GCC effectively transforms the loop when vectorizing:
6719 for (int i = 0; i < n1 / VF; ++i)
6720 for (int j = 0; j < n2; ++j)
6721 for (int k = 0; k < VF; ++k)
6722 l += a[j];
6724 which is a reassociation of the original operation. */
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "in-order double reduction not supported.\n");
6729 return false;
6732 if (reduction_type == FOLD_LEFT_REDUCTION
6733 && slp_node
6734 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6736 /* We cannot use in-order reductions in this case because there is
6737 an implicit reassociation of the operations involved. */
6738 if (dump_enabled_p ())
6739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 "in-order unchained SLP reductions not supported.\n");
6741 return false;
6744 /* For double reductions, and for SLP reductions with a neutral value,
6745 we construct a variable-length initial vector by loading a vector
6746 full of the neutral value and then shift-and-inserting the start
6747 values into the low-numbered elements. */
6748 if ((double_reduc || neutral_op)
6749 && !nunits_out.is_constant ()
6750 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6751 vectype_out, OPTIMIZE_FOR_SPEED))
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "reduction on variable-length vectors requires"
6756 " target support for a vector-shift-and-insert"
6757 " operation.\n");
6758 return false;
6761 /* Check extra constraints for variable-length unchained SLP reductions. */
6762 if (STMT_SLP_TYPE (stmt_info)
6763 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6764 && !nunits_out.is_constant ())
6766 /* We checked above that we could build the initial vector when
6767 there's a neutral element value. Check here for the case in
6768 which each SLP statement has its own initial value and in which
6769 that value needs to be repeated for every instance of the
6770 statement within the initial vector. */
6771 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6772 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6773 if (!neutral_op
6774 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "unsupported form of SLP reduction for"
6779 " variable-length vectors: cannot build"
6780 " initial vector.\n");
6781 return false;
6783 /* The epilogue code relies on the number of elements being a multiple
6784 of the group size. The duplicate-and-interleave approach to setting
6785 up the initial vector does too. */
6786 if (!multiple_p (nunits_out, group_size))
6788 if (dump_enabled_p ())
6789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6790 "unsupported form of SLP reduction for"
6791 " variable-length vectors: the vector size"
6792 " is not a multiple of the number of results.\n");
6793 return false;
6797 /* In case of widening multiplication by a constant, we update the type
6798 of the constant to be the type of the other operand. We check that the
6799 constant fits the type in the pattern recognition pass. */
6800 if (code == DOT_PROD_EXPR
6801 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6803 if (TREE_CODE (ops[0]) == INTEGER_CST)
6804 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6805 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6806 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6807 else
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "invalid types in dot-prod\n");
6813 return false;
6817 if (reduction_type == COND_REDUCTION)
6819 widest_int ni;
6821 if (! max_loop_iterations (loop, &ni))
6823 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_NOTE, vect_location,
6825 "loop count not known, cannot create cond "
6826 "reduction.\n");
6827 return false;
6829 /* Convert backedges to iterations. */
6830 ni += 1;
6832 /* The additional index will be the same type as the condition. Check
6833 that the loop iteration count fits into this type less one (the zero
6834 slot is reserved for when there are no matches). */
6835 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6836 if (wi::geu_p (ni, wi::to_widest (max_index)))
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_NOTE, vect_location,
6840 "loop size is greater than data size.\n");
6841 return false;
6845 /* In case the vectorization factor (VF) is bigger than the number
6846 of elements that we can fit in a vectype (nunits), we have to generate
6847 more than one vector stmt - i.e - we need to "unroll" the
6848 vector stmt by a factor VF/nunits. For more details see documentation
6849 in vectorizable_operation. */
6851 /* If the reduction is used in an outer loop we need to generate
6852 VF intermediate results, like so (e.g. for ncopies=2):
6853 r0 = phi (init, r0)
6854 r1 = phi (init, r1)
6855 r0 = x0 + r0;
6856 r1 = x1 + r1;
6857 (i.e. we generate VF results in 2 registers).
6858 In this case we have a separate def-use cycle for each copy, and therefore
6859 for each copy we get the vector def for the reduction variable from the
6860 respective phi node created for this copy.
6862 Otherwise (the reduction is unused in the loop nest), we can combine
6863 together intermediate results, like so (e.g. for ncopies=2):
6864 r = phi (init, r)
6865 r = x0 + r;
6866 r = x1 + r;
6867 (i.e. we generate VF/2 results in a single register).
6868 In this case for each copy we get the vector def for the reduction variable
6869 from the vectorized reduction operation generated in the previous iteration.
6871 This only works when we see both the reduction PHI and its only consumer
6872 in vectorizable_reduction and there are no intermediate stmts
6873 participating. */
6874 stmt_vec_info use_stmt_info;
6875 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6876 if (ncopies > 1
6877 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6878 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6879 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6881 single_defuse_cycle = true;
6882 epilog_copies = 1;
6884 else
6885 epilog_copies = ncopies;
6887 /* If the reduction stmt is one of the patterns that have lane
6888 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6889 if ((ncopies > 1
6890 && ! single_defuse_cycle)
6891 && (code == DOT_PROD_EXPR
6892 || code == WIDEN_SUM_EXPR
6893 || code == SAD_EXPR))
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897 "multi def-use cycle not possible for lane-reducing "
6898 "reduction operation\n");
6899 return false;
6902 if (slp_node)
6903 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6904 else
6905 vec_num = 1;
6907 internal_fn cond_fn = get_conditional_internal_fn (code);
6908 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6910 if (!vec_stmt) /* transformation not required. */
6912 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6913 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6915 if (reduction_type != FOLD_LEFT_REDUCTION
6916 && (cond_fn == IFN_LAST
6917 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6918 OPTIMIZE_FOR_SPEED)))
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "can't use a fully-masked loop because no"
6923 " conditional operation is available.\n");
6924 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6926 else if (reduc_index == -1)
6928 if (dump_enabled_p ())
6929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6930 "can't use a fully-masked loop for chained"
6931 " reductions.\n");
6932 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6934 else
6935 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6936 vectype_in);
6938 if (dump_enabled_p ()
6939 && reduction_type == FOLD_LEFT_REDUCTION)
6940 dump_printf_loc (MSG_NOTE, vect_location,
6941 "using an in-order (fold-left) reduction.\n");
6942 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6943 return true;
6946 /* Transform. */
6948 if (dump_enabled_p ())
6949 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6951 /* FORNOW: Multiple types are not supported for condition. */
6952 if (code == COND_EXPR)
6953 gcc_assert (ncopies == 1);
6955 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6957 if (reduction_type == FOLD_LEFT_REDUCTION)
6958 return vectorize_fold_left_reduction
6959 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6960 reduc_fn, ops, vectype_in, reduc_index, masks);
6962 if (reduction_type == EXTRACT_LAST_REDUCTION)
6964 gcc_assert (!slp_node);
6965 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6966 NULL, reduc_index, NULL, NULL);
6969 /* Create the destination vector */
6970 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6972 prev_stmt_info = NULL;
6973 prev_phi_info = NULL;
6974 if (!slp_node)
6976 vec_oprnds0.create (1);
6977 vec_oprnds1.create (1);
6978 if (op_type == ternary_op)
6979 vec_oprnds2.create (1);
6982 phis.create (vec_num);
6983 vect_defs.create (vec_num);
6984 if (!slp_node)
6985 vect_defs.quick_push (NULL_TREE);
6987 if (slp_node)
6988 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6989 else
6990 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6992 for (j = 0; j < ncopies; j++)
6994 if (code == COND_EXPR)
6996 gcc_assert (!slp_node);
6997 vectorizable_condition (stmt_info, gsi, vec_stmt,
6998 PHI_RESULT (phis[0]->stmt),
6999 reduc_index, NULL, NULL);
7000 /* Multiple types are not supported for condition. */
7001 break;
7003 if (code == LSHIFT_EXPR
7004 || code == RSHIFT_EXPR)
7006 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7007 break;
7010 /* Handle uses. */
7011 if (j == 0)
7013 if (slp_node)
7015 /* Get vec defs for all the operands except the reduction index,
7016 ensuring the ordering of the ops in the vector is kept. */
7017 auto_vec<tree, 3> slp_ops;
7018 auto_vec<vec<tree>, 3> vec_defs;
7020 slp_ops.quick_push (ops[0]);
7021 slp_ops.quick_push (ops[1]);
7022 if (op_type == ternary_op)
7023 slp_ops.quick_push (ops[2]);
7025 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7027 vec_oprnds0.safe_splice (vec_defs[0]);
7028 vec_defs[0].release ();
7029 vec_oprnds1.safe_splice (vec_defs[1]);
7030 vec_defs[1].release ();
7031 if (op_type == ternary_op)
7033 vec_oprnds2.safe_splice (vec_defs[2]);
7034 vec_defs[2].release ();
7037 else
7039 vec_oprnds0.quick_push
7040 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7041 vec_oprnds1.quick_push
7042 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7043 if (op_type == ternary_op)
7044 vec_oprnds2.quick_push
7045 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7048 else
7050 if (!slp_node)
7052 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7054 if (single_defuse_cycle && reduc_index == 0)
7055 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7056 else
7057 vec_oprnds0[0]
7058 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7059 vec_oprnds0[0]);
7060 if (single_defuse_cycle && reduc_index == 1)
7061 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7062 else
7063 vec_oprnds1[0]
7064 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7065 vec_oprnds1[0]);
7066 if (op_type == ternary_op)
7068 if (single_defuse_cycle && reduc_index == 2)
7069 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7070 else
7071 vec_oprnds2[0]
7072 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7073 vec_oprnds2[0]);
7078 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7080 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7081 if (masked_loop_p)
7083 /* Make sure that the reduction accumulator is vop[0]. */
7084 if (reduc_index == 1)
7086 gcc_assert (commutative_tree_code (code));
7087 std::swap (vop[0], vop[1]);
7089 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7090 vectype_in, i * ncopies + j);
7091 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7092 vop[0], vop[1],
7093 vop[0]);
7094 new_temp = make_ssa_name (vec_dest, call);
7095 gimple_call_set_lhs (call, new_temp);
7096 gimple_call_set_nothrow (call, true);
7097 new_stmt_info
7098 = vect_finish_stmt_generation (stmt_info, call, gsi);
7100 else
7102 if (op_type == ternary_op)
7103 vop[2] = vec_oprnds2[i];
7105 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7106 vop[0], vop[1], vop[2]);
7107 new_temp = make_ssa_name (vec_dest, new_stmt);
7108 gimple_assign_set_lhs (new_stmt, new_temp);
7109 new_stmt_info
7110 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7113 if (slp_node)
7115 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7116 vect_defs.quick_push (new_temp);
7118 else
7119 vect_defs[0] = new_temp;
7122 if (slp_node)
7123 continue;
7125 if (j == 0)
7126 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7127 else
7128 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7130 prev_stmt_info = new_stmt_info;
7133 /* Finalize the reduction-phi (set its arguments) and create the
7134 epilog reduction code. */
7135 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7136 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7138 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7139 epilog_copies, reduc_fn, phis,
7140 double_reduc, slp_node, slp_node_instance,
7141 cond_reduc_val, cond_reduc_op_code,
7142 neutral_op);
7144 return true;
7147 /* Function vect_min_worthwhile_factor.
7149 For a loop where we could vectorize the operation indicated by CODE,
7150 return the minimum vectorization factor that makes it worthwhile
7151 to use generic vectors. */
7152 static unsigned int
7153 vect_min_worthwhile_factor (enum tree_code code)
7155 switch (code)
7157 case PLUS_EXPR:
7158 case MINUS_EXPR:
7159 case NEGATE_EXPR:
7160 return 4;
7162 case BIT_AND_EXPR:
7163 case BIT_IOR_EXPR:
7164 case BIT_XOR_EXPR:
7165 case BIT_NOT_EXPR:
7166 return 2;
7168 default:
7169 return INT_MAX;
7173 /* Return true if VINFO indicates we are doing loop vectorization and if
7174 it is worth decomposing CODE operations into scalar operations for
7175 that loop's vectorization factor. */
7177 bool
7178 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7180 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7181 unsigned HOST_WIDE_INT value;
7182 return (loop_vinfo
7183 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7184 && value >= vect_min_worthwhile_factor (code));
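/* For example (illustration only): with a compile-time vectorization
   factor of 4,

     vect_worthwhile_without_simd_p (vinfo, PLUS_EXPR)

   returns true (4 >= 4); the bitwise codes are already worthwhile at
   factor 2, and any other code fails the test because its minimum
   worthwhile factor is INT_MAX.  */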
7187 /* Function vectorizable_induction
7189 Check if STMT_INFO performs an induction computation that can be vectorized.
7190 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7191 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7192 Return true if STMT_INFO is vectorizable in this way. */
7194 bool
7195 vectorizable_induction (stmt_vec_info stmt_info,
7196 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7197 stmt_vec_info *vec_stmt, slp_tree slp_node,
7198 stmt_vector_for_cost *cost_vec)
7200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7201 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7202 unsigned ncopies;
7203 bool nested_in_vect_loop = false;
7204 struct loop *iv_loop;
7205 tree vec_def;
7206 edge pe = loop_preheader_edge (loop);
7207 basic_block new_bb;
7208 tree new_vec, vec_init, vec_step, t;
7209 tree new_name;
7210 gimple *new_stmt;
7211 gphi *induction_phi;
7212 tree induc_def, vec_dest;
7213 tree init_expr, step_expr;
7214 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7215 unsigned i;
7216 tree expr;
7217 gimple_seq stmts;
7218 imm_use_iterator imm_iter;
7219 use_operand_p use_p;
7220 gimple *exit_phi;
7221 edge latch_e;
7222 tree loop_arg;
7223 gimple_stmt_iterator si;
7225 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7226 if (!phi)
7227 return false;
7229 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7230 return false;
7232 /* Make sure it was recognized as induction computation. */
7233 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7234 return false;
7236 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7237 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7239 if (slp_node)
7240 ncopies = 1;
7241 else
7242 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7243 gcc_assert (ncopies >= 1);
7245 /* FORNOW. These restrictions should be relaxed. */
7246 if (nested_in_vect_loop_p (loop, stmt_info))
7248 imm_use_iterator imm_iter;
7249 use_operand_p use_p;
7250 gimple *exit_phi;
7251 edge latch_e;
7252 tree loop_arg;
7254 if (ncopies > 1)
7256 if (dump_enabled_p ())
7257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258 "multiple types in nested loop.\n");
7259 return false;
7262 /* FORNOW: outer loop induction with SLP not supported. */
7263 if (STMT_SLP_TYPE (stmt_info))
7264 return false;
7266 exit_phi = NULL;
7267 latch_e = loop_latch_edge (loop->inner);
7268 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7269 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7271 gimple *use_stmt = USE_STMT (use_p);
7272 if (is_gimple_debug (use_stmt))
7273 continue;
7275 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7277 exit_phi = use_stmt;
7278 break;
7281 if (exit_phi)
7283 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7284 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7285 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7287 if (dump_enabled_p ())
7288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7289 "inner-loop induction only used outside "
7290 "of the outer vectorized loop.\n");
7291 return false;
7295 nested_in_vect_loop = true;
7296 iv_loop = loop->inner;
7298 else
7299 iv_loop = loop;
7300 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7302 if (slp_node && !nunits.is_constant ())
7304 /* The current SLP code creates the initial value element-by-element. */
7305 if (dump_enabled_p ())
7306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7307 "SLP induction not supported for variable-length"
7308 " vectors.\n");
7309 return false;
7312 if (!vec_stmt) /* transformation not required. */
7314 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7315 DUMP_VECT_SCOPE ("vectorizable_induction");
7316 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7317 return true;
7320 /* Transform. */
7322 /* Compute a vector variable, initialized with the first VF values of
7323 the induction variable. E.g., for an iv with IV_PHI='X' and
7324 evolution S, for a vector of 4 units, we want to compute:
7325 [X, X + S, X + 2*S, X + 3*S]. */
7327 if (dump_enabled_p ())
7328 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7330 latch_e = loop_latch_edge (iv_loop);
7331 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7333 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7334 gcc_assert (step_expr != NULL_TREE);
7336 pe = loop_preheader_edge (iv_loop);
7337 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7338 loop_preheader_edge (iv_loop));
7340 stmts = NULL;
7341 if (!nested_in_vect_loop)
7343 /* Convert the initial value to the desired type. */
7344 tree new_type = TREE_TYPE (vectype);
7345 init_expr = gimple_convert (&stmts, new_type, init_expr);
7347 /* If we are using the loop mask to "peel" for alignment then we need
7348 to adjust the start value here. */
7349 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7350 if (skip_niters != NULL_TREE)
7352 if (FLOAT_TYPE_P (vectype))
7353 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7354 skip_niters);
7355 else
7356 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7357 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7358 skip_niters, step_expr);
7359 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7360 init_expr, skip_step);
7364 /* Convert the step to the desired type. */
7365 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7367 if (stmts)
7369 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7370 gcc_assert (!new_bb);
7373 /* Find the first insertion point in the BB. */
7374 basic_block bb = gimple_bb (phi);
7375 si = gsi_after_labels (bb);
7377 /* For SLP induction we have to generate several IVs as for example
7378 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7379 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7380 [VF*S, VF*S, VF*S, VF*S] for all. */
7381 if (slp_node)
7383 /* Enforced above. */
7384 unsigned int const_nunits = nunits.to_constant ();
7386 /* Generate [VF*S, VF*S, ... ]. */
7387 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7389 expr = build_int_cst (integer_type_node, vf);
7390 expr = fold_convert (TREE_TYPE (step_expr), expr);
7392 else
7393 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7394 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7395 expr, step_expr);
7396 if (! CONSTANT_CLASS_P (new_name))
7397 new_name = vect_init_vector (stmt_info, new_name,
7398 TREE_TYPE (step_expr), NULL);
7399 new_vec = build_vector_from_val (vectype, new_name);
7400 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7402 /* Now generate the IVs. */
7403 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7404 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7405 unsigned elts = const_nunits * nvects;
7406 unsigned nivs = least_common_multiple (group_size,
7407 const_nunits) / const_nunits;
7408 gcc_assert (elts % group_size == 0);
7409 tree elt = init_expr;
7410 unsigned ivn;
7411 for (ivn = 0; ivn < nivs; ++ivn)
7413 tree_vector_builder elts (vectype, const_nunits, 1);
7414 stmts = NULL;
7415 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7417 if (ivn*const_nunits + eltn >= group_size
7418 && (ivn * const_nunits + eltn) % group_size == 0)
7419 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7420 elt, step_expr);
7421 elts.quick_push (elt);
7423 vec_init = gimple_build_vector (&stmts, &elts);
7424 if (stmts)
7426 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7427 gcc_assert (!new_bb);
7430 /* Create the induction-phi that defines the induction-operand. */
7431 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7432 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7433 stmt_vec_info induction_phi_info
7434 = loop_vinfo->add_stmt (induction_phi);
7435 induc_def = PHI_RESULT (induction_phi);
7437 /* Create the iv update inside the loop */
7438 vec_def = make_ssa_name (vec_dest);
7439 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7440 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7441 loop_vinfo->add_stmt (new_stmt);
7443 /* Set the arguments of the phi node: */
7444 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7445 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7446 UNKNOWN_LOCATION);
7448 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7451 /* Re-use IVs when we can. */
7452 if (ivn < nvects)
7454 unsigned vfp
7455 = least_common_multiple (group_size, const_nunits) / group_size;
7456 /* Generate [VF'*S, VF'*S, ... ]. */
7457 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7459 expr = build_int_cst (integer_type_node, vfp);
7460 expr = fold_convert (TREE_TYPE (step_expr), expr);
7462 else
7463 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7464 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7465 expr, step_expr);
7466 if (! CONSTANT_CLASS_P (new_name))
7467 new_name = vect_init_vector (stmt_info, new_name,
7468 TREE_TYPE (step_expr), NULL);
7469 new_vec = build_vector_from_val (vectype, new_name);
7470 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7471 for (; ivn < nvects; ++ivn)
7473 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7474 tree def;
7475 if (gimple_code (iv) == GIMPLE_PHI)
7476 def = gimple_phi_result (iv);
7477 else
7478 def = gimple_assign_lhs (iv);
7479 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7480 PLUS_EXPR,
7481 def, vec_step);
7482 if (gimple_code (iv) == GIMPLE_PHI)
7483 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7484 else
7486 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7487 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7489 SLP_TREE_VEC_STMTS (slp_node).quick_push
7490 (loop_vinfo->add_stmt (new_stmt));
7494 return true;
7497 /* Create the vector that holds the initial_value of the induction. */
7498 if (nested_in_vect_loop)
7500 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7501 been created during vectorization of previous stmts. We obtain it
7502 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7503 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7504 /* If the initial value is not of proper type, convert it. */
7505 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7507 new_stmt
7508 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7509 vect_simple_var,
7510 "vec_iv_"),
7511 VIEW_CONVERT_EXPR,
7512 build1 (VIEW_CONVERT_EXPR, vectype,
7513 vec_init));
7514 vec_init = gimple_assign_lhs (new_stmt);
7515 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7516 new_stmt);
7517 gcc_assert (!new_bb);
7518 loop_vinfo->add_stmt (new_stmt);
7521 else
7523 /* iv_loop is the loop to be vectorized. Create:
7524 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7525 stmts = NULL;
7526 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7528 unsigned HOST_WIDE_INT const_nunits;
7529 if (nunits.is_constant (&const_nunits))
7531 tree_vector_builder elts (vectype, const_nunits, 1);
7532 elts.quick_push (new_name);
7533 for (i = 1; i < const_nunits; i++)
7535 /* Create: new_name_i = new_name + step_expr */
7536 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7537 new_name, step_expr);
7538 elts.quick_push (new_name);
7540 /* Create a vector from [new_name_0, new_name_1, ...,
7541 new_name_nunits-1] */
7542 vec_init = gimple_build_vector (&stmts, &elts);
7544 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7545 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7546 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7547 new_name, step_expr);
7548 else
7550 /* Build:
7551 [base, base, base, ...]
7552 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7553 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7554 gcc_assert (flag_associative_math);
7555 tree index = build_index_vector (vectype, 0, 1);
7556 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7557 new_name);
7558 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7559 step_expr);
7560 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7561 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7562 vec_init, step_vec);
7563 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7564 vec_init, base_vec);
7567 if (stmts)
7569 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7570 gcc_assert (!new_bb);
7575 /* Create the vector that holds the step of the induction. */
7576 if (nested_in_vect_loop)
7577 /* iv_loop is nested in the loop to be vectorized. Generate:
7578 vec_step = [S, S, S, S] */
7579 new_name = step_expr;
7580 else
7582 /* iv_loop is the loop to be vectorized. Generate:
7583 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7584 gimple_seq seq = NULL;
7585 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7587 expr = build_int_cst (integer_type_node, vf);
7588 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7590 else
7591 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7592 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7593 expr, step_expr);
7594 if (seq)
7596 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7597 gcc_assert (!new_bb);
7601 t = unshare_expr (new_name);
7602 gcc_assert (CONSTANT_CLASS_P (new_name)
7603 || TREE_CODE (new_name) == SSA_NAME);
7604 new_vec = build_vector_from_val (vectype, t);
7605 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7608 /* Create the following def-use cycle:
7609 loop prolog:
7610 vec_init = ...
7611 vec_step = ...
7612 loop:
7613 vec_iv = PHI <vec_init, vec_loop>
7615 STMT
7617 vec_loop = vec_iv + vec_step; */
7619 /* Create the induction-phi that defines the induction-operand. */
7620 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7621 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7622 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7623 induc_def = PHI_RESULT (induction_phi);
7625 /* Create the iv update inside the loop */
7626 vec_def = make_ssa_name (vec_dest);
7627 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7628 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7629 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7631 /* Set the arguments of the phi node: */
7632 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7633 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7634 UNKNOWN_LOCATION);
7636 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7638 /* In case the vectorization factor (VF) is bigger than the number
7639 of elements that we can fit in a vectype (nunits), we have to generate
7640 more than one vector stmt - i.e - we need to "unroll" the
7641 vector stmt by a factor VF/nunits. For more details see documentation
7642 in vectorizable_operation. */
7644 if (ncopies > 1)
7646 gimple_seq seq = NULL;
7647 stmt_vec_info prev_stmt_vinfo;
7648 /* FORNOW. This restriction should be relaxed. */
7649 gcc_assert (!nested_in_vect_loop);
7651 /* Create the vector that holds the step of the induction. */
7652 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7654 expr = build_int_cst (integer_type_node, nunits);
7655 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7657 else
7658 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7659 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7660 expr, step_expr);
7661 if (seq)
7663 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7664 gcc_assert (!new_bb);
7667 t = unshare_expr (new_name);
7668 gcc_assert (CONSTANT_CLASS_P (new_name)
7669 || TREE_CODE (new_name) == SSA_NAME);
7670 new_vec = build_vector_from_val (vectype, t);
7671 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7673 vec_def = induc_def;
7674 prev_stmt_vinfo = induction_phi_info;
7675 for (i = 1; i < ncopies; i++)
7677 /* vec_i = vec_prev + vec_step */
7678 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7679 vec_def, vec_step);
7680 vec_def = make_ssa_name (vec_dest, new_stmt);
7681 gimple_assign_set_lhs (new_stmt, vec_def);
7683 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7684 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7685 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7686 prev_stmt_vinfo = new_stmt_info;
7690 if (nested_in_vect_loop)
7692 /* Find the loop-closed exit-phi of the induction, and record
7693 the final vector of induction results: */
7694 exit_phi = NULL;
7695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7697 gimple *use_stmt = USE_STMT (use_p);
7698 if (is_gimple_debug (use_stmt))
7699 continue;
7701 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7703 exit_phi = use_stmt;
7704 break;
7707 if (exit_phi)
7709 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7710 /* FORNOW. We do not yet support the case in which an inner-loop induction
7711 is only used outside the outer loop (i.e. not used in the outer loop). */
7712 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7713 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7715 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_NOTE, vect_location,
7718 "vector of inductions after inner-loop:%G",
7719 new_stmt);
7724 if (dump_enabled_p ())
7725 dump_printf_loc (MSG_NOTE, vect_location,
7726 "transform induction: created def-use cycle: %G%G",
7727 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7729 return true;
7732 /* Function vectorizable_live_operation.
7734 STMT_INFO computes a value that is used outside the loop. Check if
7735 it can be supported. */
7737 bool
7738 vectorizable_live_operation (stmt_vec_info stmt_info,
7739 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7740 slp_tree slp_node, int slp_index,
7741 stmt_vec_info *vec_stmt,
7742 stmt_vector_for_cost *)
7744 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7745 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7746 imm_use_iterator imm_iter;
7747 tree lhs, lhs_type, bitsize, vec_bitsize;
7748 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7749 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7750 int ncopies;
7751 gimple *use_stmt;
7752 auto_vec<tree> vec_oprnds;
7753 int vec_entry = 0;
7754 poly_uint64 vec_index = 0;
7756 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7758 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7759 return false;
7761 /* FORNOW. CHECKME. */
7762 if (nested_in_vect_loop_p (loop, stmt_info))
7763 return false;
7765 /* If STMT is not relevant and it is a simple assignment and its inputs are
7766 invariant then it can remain in place, unvectorized. The original last
7767 scalar value that it computes will be used. */
7768 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7770 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7771 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "statement is simple and uses invariant. Leaving in "
7774 "place.\n");
7775 return true;
7778 if (slp_node)
7779 ncopies = 1;
7780 else
7781 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7783 if (slp_node)
7785 gcc_assert (slp_index >= 0);
7787 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7788 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7790 /* Get the last occurrence of the scalar index from the concatenation of
7791 all the slp vectors. Calculate which slp vector it is and the index
7792 within. */
7793 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7795 /* Calculate which vector contains the result, and which lane of
7796 that vector we need. */
7797 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7799 if (dump_enabled_p ())
7800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7801 "Cannot determine which vector holds the"
7802 " final result.\n");
7803 return false;
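/* A small worked example of the calculation above (assumed numbers):
   with num_vec = 2 vectors of nunits = 4 elements, num_scalar = 3
   scalar statements and slp_index = 1, pos = 2 * 4 - 3 + 1 = 6, so
   the live value is lane vec_index = 2 of vector vec_entry = 1.  */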
7807 if (!vec_stmt)
7809 /* No transformation required. */
7810 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7812 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7813 OPTIMIZE_FOR_SPEED))
7815 if (dump_enabled_p ())
7816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7817 "can't use a fully-masked loop because "
7818 "the target doesn't support extract last "
7819 "reduction.\n");
7820 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7822 else if (slp_node)
7824 if (dump_enabled_p ())
7825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7826 "can't use a fully-masked loop because an "
7827 "SLP statement is live after the loop.\n");
7828 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7830 else if (ncopies > 1)
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7834 "can't use a fully-masked loop because"
7835 " ncopies is greater than 1.\n");
7836 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7838 else
7840 gcc_assert (ncopies == 1 && !slp_node);
7841 vect_record_loop_mask (loop_vinfo,
7842 &LOOP_VINFO_MASKS (loop_vinfo),
7843 1, vectype);
7846 return true;
7849 /* Use the lhs of the original scalar statement. */
7850 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7852 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7853 : gimple_get_lhs (stmt);
7854 lhs_type = TREE_TYPE (lhs);
7856 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7857 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7858 : TYPE_SIZE (TREE_TYPE (vectype)));
7859 vec_bitsize = TYPE_SIZE (vectype);
7861 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7862 tree vec_lhs, bitstart;
7863 if (slp_node)
7865 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7867 /* Get the correct slp vectorized stmt. */
7868 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7869 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7870 vec_lhs = gimple_phi_result (phi);
7871 else
7872 vec_lhs = gimple_get_lhs (vec_stmt);
7874 /* Get entry to use. */
7875 bitstart = bitsize_int (vec_index);
7876 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7878 else
7880 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7881 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7882 gcc_checking_assert (ncopies == 1
7883 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7885 /* For multiple copies, get the last copy. */
7886 for (int i = 1; i < ncopies; ++i)
7887 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7889 /* Get the last lane in the vector. */
7890 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7893 gimple_seq stmts = NULL;
7894 tree new_tree;
7895 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7897 /* Emit:
7899 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7901 where VEC_LHS is the vectorized live-out result and MASK is
7902 the loop mask for the final iteration. */
7903 gcc_assert (ncopies == 1 && !slp_node);
7904 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7905 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7906 1, vectype, 0);
7907 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7908 scalar_type, mask, vec_lhs);
7910 /* Convert the extracted vector element to the required scalar type. */
7911 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7913 else
7915 tree bftype = TREE_TYPE (vectype);
7916 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7917 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7918 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7919 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7920 &stmts, true, NULL_TREE);
7923 if (stmts)
7924 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7926 /* Replace use of lhs with newly computed result. If the use stmt is a
7927 single arg PHI, just replace all uses of PHI result. It's necessary
7928 because lcssa PHI defining lhs may be before newly inserted stmt. */
7929 use_operand_p use_p;
7930 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7931 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7932 && !is_gimple_debug (use_stmt))
7934 if (gimple_code (use_stmt) == GIMPLE_PHI
7935 && gimple_phi_num_args (use_stmt) == 1)
7937 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7939 else
7941 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7942 SET_USE (use_p, new_tree);
7944 update_stmt (use_stmt);
7947 return true;
7950 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7952 static void
7953 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7955 ssa_op_iter op_iter;
7956 imm_use_iterator imm_iter;
7957 def_operand_p def_p;
7958 gimple *ustmt;
7960 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7962 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7964 basic_block bb;
7966 if (!is_gimple_debug (ustmt))
7967 continue;
7969 bb = gimple_bb (ustmt);
7971 if (!flow_bb_inside_loop_p (loop, bb))
7973 if (gimple_debug_bind_p (ustmt))
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_NOTE, vect_location,
7977 "killing debug use\n");
7979 gimple_debug_bind_reset_value (ustmt);
7980 update_stmt (ustmt);
7982 else
7983 gcc_unreachable ();
7989 /* Given loop represented by LOOP_VINFO, return true if computation of
7990 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7991 otherwise. */
7993 static bool
7994 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7996 /* Constant case. */
7997 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7999 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8000 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8002 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8003 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8004 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8005 return true;
8008 widest_int max;
8009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8010 /* Check the upper bound of loop niters. */
8011 if (get_max_loop_iterations (loop, &max))
8013 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8014 signop sgn = TYPE_SIGN (type);
8015 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8016 if (max < type_max)
8017 return true;
8019 return false;
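/* Example (illustrative): for a loop whose latch can execute up to
   UINT_MAX times with a 32-bit niters type, NITERS = NITERSM1 + 1
   wraps to zero, so the function returns false; if the latch count is
   provably smaller than the type's maximum it returns true.  */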
8022 /* Return a mask type with half the number of elements as TYPE. */
8024 tree
8025 vect_halve_mask_nunits (tree type)
8027 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8028 return build_truth_vector_type (nunits, current_vector_size);
8031 /* Return a mask type with twice as many elements as TYPE. */
8033 tree
8034 vect_double_mask_nunits (tree type)
8036 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8037 return build_truth_vector_type (nunits, current_vector_size);
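/* For instance (illustrative only): halving a mask type with 8 + 8*x
   elements gives one with 4 + 4*x elements, and doubling that type
   gives the original back; both keep the current vector size.  */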
8040 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8041 contain a sequence of NVECTORS masks that each control a vector of type
8042 VECTYPE. */
8044 void
8045 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8046 unsigned int nvectors, tree vectype)
8048 gcc_assert (nvectors != 0);
8049 if (masks->length () < nvectors)
8050 masks->safe_grow_cleared (nvectors);
8051 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8052 /* The number of scalars per iteration and the number of vectors are
8053 both compile-time constants. */
8054 unsigned int nscalars_per_iter
8055 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8056 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8057 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8059 rgm->max_nscalars_per_iter = nscalars_per_iter;
8060 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
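/* A worked example with assumed numbers: for a vectorization factor
   of 16, a statement using two V8HI vectors per iteration records
   NVECTORS = 2 and gets 2 * 8 / 16 = 1 scalar per iteration, while a
   grouped access needing four such vectors records NVECTORS = 4 and
   bumps max_nscalars_per_iter of its rgroup to 4 * 8 / 16 = 2.  */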
8064 /* Given a complete set of masks MASKS, extract mask number INDEX
8065 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8066 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8068 See the comment above vec_loop_masks for more details about the mask
8069 arrangement. */
8071 tree
8072 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8073 unsigned int nvectors, tree vectype, unsigned int index)
8075 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8076 tree mask_type = rgm->mask_type;
8078 /* Populate the rgroup's mask array, if this is the first time we've
8079 used it. */
8080 if (rgm->masks.is_empty ())
8082 rgm->masks.safe_grow_cleared (nvectors);
8083 for (unsigned int i = 0; i < nvectors; ++i)
8085 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8086 /* Provide a dummy definition until the real one is available. */
8087 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8088 rgm->masks[i] = mask;
8092 tree mask = rgm->masks[index];
8093 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8094 TYPE_VECTOR_SUBPARTS (vectype)))
8096 /* A loop mask for data type X can be reused for data type Y
8097 if X has N times more elements than Y and if Y's elements
8098 are N times bigger than X's. In this case each sequence
8099 of N elements in the loop mask will be all-zero or all-one.
8100 We can then view-convert the mask so that each sequence of
8101 N elements is replaced by a single element. */
8102 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8103 TYPE_VECTOR_SUBPARTS (vectype)));
8104 gimple_seq seq = NULL;
8105 mask_type = build_same_sized_truth_vector_type (vectype);
8106 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8107 if (seq)
8108 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8110 return mask;
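/* Sketch of the reuse case above (illustrative types): a mask created
   for sixteen QImode elements can control eight HImode elements,
   because each consecutive pair of mask elements is known to be
   all-zero or all-one; the VIEW_CONVERT_EXPR collapses every such
   pair into one element of the eight-element mask type.  */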
8113 /* Scale profiling counters by estimation for LOOP which is vectorized
8114 by factor VF. */
8116 static void
8117 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8119 edge preheader = loop_preheader_edge (loop);
8120 /* Reduce loop iterations by the vectorization factor. */
8121 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8122 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8124 if (freq_h.nonzero_p ())
8126 profile_probability p;
8128 /* Avoid dropping loop body profile counter to 0 because of zero count
8129 in loop's preheader. */
8130 if (!(freq_e == profile_count::zero ()))
8131 freq_e = freq_e.force_nonzero ();
8132 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8133 scale_loop_frequencies (loop, p);
8136 edge exit_e = single_exit (loop);
8137 exit_e->probability = profile_probability::always ()
8138 .apply_scale (1, new_est_niter + 1);
8140 edge exit_l = single_pred_edge (loop->latch);
8141 profile_probability prob = exit_l->probability;
8142 exit_l->probability = exit_e->probability.invert ();
8143 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8144 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
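/* Rough example (assumed counts): if the scalar loop was expected to
   iterate about 100 times and is vectorized by a factor of 4,
   new_est_niter is about 25, the exit edge gets probability
   1/(new_est_niter + 1) and the body counts are scaled to match the
   reduced trip count.  */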
8147 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8148 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8149 stmt_vec_info. */
8151 static void
8152 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8153 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8156 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8158 if (dump_enabled_p ())
8159 dump_printf_loc (MSG_NOTE, vect_location,
8160 "------>vectorizing statement: %G", stmt_info->stmt);
8162 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8163 vect_loop_kill_debug_uses (loop, stmt_info);
8165 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8166 && !STMT_VINFO_LIVE_P (stmt_info))
8167 return;
8169 if (STMT_VINFO_VECTYPE (stmt_info))
8171 poly_uint64 nunits
8172 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8173 if (!STMT_SLP_TYPE (stmt_info)
8174 && maybe_ne (nunits, vf)
8175 && dump_enabled_p ())
8176 /* For SLP VF is set according to unrolling factor, and not
8177 to vector size, hence for SLP this print is not valid. */
8178 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8181 /* Pure SLP statements have already been vectorized. We still need
8182 to apply loop vectorization to hybrid SLP statements. */
8183 if (PURE_SLP_STMT (stmt_info))
8184 return;
8186 if (dump_enabled_p ())
8187 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8189 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8190 *seen_store = stmt_info;
8193 /* Function vect_transform_loop.
8195 The analysis phase has determined that the loop is vectorizable.
8196 Vectorize the loop - created vectorized stmts to replace the scalar
8197 stmts in the loop, and update the loop exit condition.
8198 Returns scalar epilogue loop if any. */
8200 struct loop *
8201 vect_transform_loop (loop_vec_info loop_vinfo)
8203 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8204 struct loop *epilogue = NULL;
8205 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8206 int nbbs = loop->num_nodes;
8207 int i;
8208 tree niters_vector = NULL_TREE;
8209 tree step_vector = NULL_TREE;
8210 tree niters_vector_mult_vf = NULL_TREE;
8211 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8212 unsigned int lowest_vf = constant_lower_bound (vf);
8213 gimple *stmt;
8214 bool check_profitability = false;
8215 unsigned int th;
8217 DUMP_VECT_SCOPE ("vec_transform_loop");
8219 loop_vinfo->shared->check_datarefs ();
8221 /* Use the more conservative vectorization threshold. If the number
8222 of iterations is constant assume the cost check has been performed
8223 by our caller. If the threshold makes all loops profitable that
8224 run at least the (estimated) vectorization factor number of times
8225 checking is pointless, too. */
8226 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8227 if (th >= vect_vf_for_cost (loop_vinfo)
8228 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8230 if (dump_enabled_p ())
8231 dump_printf_loc (MSG_NOTE, vect_location,
8232 "Profitability threshold is %d loop iterations.\n",
8233 th);
8234 check_profitability = true;
8237 /* Make sure there exists a single-predecessor exit bb. Do this before
8238 versioning. */
8239 edge e = single_exit (loop);
8240 if (! single_pred_p (e->dest))
8242 split_loop_exit_edge (e, true);
8243 if (dump_enabled_p ())
8244 dump_printf (MSG_NOTE, "split exit edge\n");
8247 /* Version the loop first, if required, so the profitability check
8248 comes first. */
8250 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8252 poly_uint64 versioning_threshold
8253 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8254 if (check_profitability
8255 && ordered_p (poly_uint64 (th), versioning_threshold))
8257 versioning_threshold = ordered_max (poly_uint64 (th),
8258 versioning_threshold);
8259 check_profitability = false;
8261 vect_loop_versioning (loop_vinfo, th, check_profitability,
8262 versioning_threshold);
8263 check_profitability = false;
8266 /* Make sure there exists a single-predecessor exit bb also on the
8267 scalar loop copy. Do this after versioning but before peeling
8268 so CFG structure is fine for both scalar and if-converted loop
8269 to make slpeel_duplicate_current_defs_from_edges face matched
8270 loop closed PHI nodes on the exit. */
8271 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8273 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8274 if (! single_pred_p (e->dest))
8276 split_loop_exit_edge (e, true);
8277 if (dump_enabled_p ())
8278 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8282 tree niters = vect_build_loop_niters (loop_vinfo);
8283 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8284 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8285 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8286 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8287 &step_vector, &niters_vector_mult_vf, th,
8288 check_profitability, niters_no_overflow);
8290 if (niters_vector == NULL_TREE)
8292 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8293 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8294 && known_eq (lowest_vf, vf))
8296 niters_vector
8297 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8298 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8299 step_vector = build_one_cst (TREE_TYPE (niters));
8301 else
8302 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8303 &step_vector, niters_no_overflow);
8306 /* 1) Make sure the loop header has exactly two entries
8307 2) Make sure we have a preheader basic block. */
8309 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8311 split_edge (loop_preheader_edge (loop));
8313 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8314 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8315 /* This will deal with any possible peeling. */
8316 vect_prepare_for_masked_peels (loop_vinfo);
8318 /* Schedule the SLP instances first, then handle loop vectorization
8319 below. */
8320 if (!loop_vinfo->slp_instances.is_empty ())
8322 DUMP_VECT_SCOPE ("scheduling SLP instances");
8323 vect_schedule_slp (loop_vinfo);
8326 /* FORNOW: the vectorizer supports only loops whose body consists
8327 of one basic block (header + empty latch). When the vectorizer
8328 supports more involved loop forms, the order in which the BBs are
8329 traversed will need to be reconsidered. */
8331 for (i = 0; i < nbbs; i++)
8333 basic_block bb = bbs[i];
8334 stmt_vec_info stmt_info;
8336 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8337 gsi_next (&si))
8339 gphi *phi = si.phi ();
8340 if (dump_enabled_p ())
8341 dump_printf_loc (MSG_NOTE, vect_location,
8342 "------>vectorizing phi: %G", phi);
8343 stmt_info = loop_vinfo->lookup_stmt (phi);
8344 if (!stmt_info)
8345 continue;
8347 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8348 vect_loop_kill_debug_uses (loop, stmt_info);
8350 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8351 && !STMT_VINFO_LIVE_P (stmt_info))
8352 continue;
8354 if (STMT_VINFO_VECTYPE (stmt_info)
8355 && (maybe_ne
8356 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8357 && dump_enabled_p ())
8358 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8360 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8361 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8362 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8363 && ! PURE_SLP_STMT (stmt_info))
8365 if (dump_enabled_p ())
8366 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8367 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8371 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8372 !gsi_end_p (si);)
8374 stmt = gsi_stmt (si);
8375 /* During vectorization remove existing clobber stmts. */
8376 if (gimple_clobber_p (stmt))
8378 unlink_stmt_vdef (stmt);
8379 gsi_remove (&si, true);
8380 release_defs (stmt);
8382 else
8384 stmt_info = loop_vinfo->lookup_stmt (stmt);
8386 /* vector stmts created in the outer-loop during vectorization of
8387 stmts in an inner-loop may not have a stmt_info, and do not
8388 need to be vectorized. */
8389 stmt_vec_info seen_store = NULL;
8390 if (stmt_info)
8392 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8394 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8395 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8396 !gsi_end_p (subsi); gsi_next (&subsi))
8398 stmt_vec_info pat_stmt_info
8399 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8400 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8401 &si, &seen_store);
8403 stmt_vec_info pat_stmt_info
8404 = STMT_VINFO_RELATED_STMT (stmt_info);
8405 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8406 &seen_store);
8408 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8409 &seen_store);
8411 gsi_next (&si);
8412 if (seen_store)
8414 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8415 /* Interleaving. The vectorization of the
8416 interleaving chain was completed -
8417 free all the stores in the chain. */
8418 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8419 else
8420 /* Free the attached stmt_vec_info and remove the stmt. */
8421 loop_vinfo->remove_stmt (stmt_info);
8426 /* Stub out scalar statements that must not survive vectorization.
8427 Doing this here helps with grouped statements, or statements that
8428 are involved in patterns. */
8429 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8430 !gsi_end_p (gsi); gsi_next (&gsi))
8432 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8433 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8435 tree lhs = gimple_get_lhs (call);
8436 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8438 tree zero = build_zero_cst (TREE_TYPE (lhs));
8439 gimple *new_stmt = gimple_build_assign (lhs, zero);
8440 gsi_replace (&gsi, new_stmt, true);
8444 } /* BBs in loop */
8446 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8447 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8448 if (integer_onep (step_vector))
8449 niters_no_overflow = true;
8450 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8451 niters_vector_mult_vf, !niters_no_overflow);
8453 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8454 scale_profile_for_vect_loop (loop, assumed_vf);
8456 /* True if the final iteration might not handle a full vector's
8457 worth of scalar iterations. */
8458 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8459 /* The minimum number of iterations performed by the epilogue. This
8460 is 1 when peeling for gaps because we always need a final scalar
8461 iteration. */
8462 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8463 /* +1 to convert latch counts to loop iteration counts,
8464 -min_epilogue_iters to remove iterations that cannot be performed
8465 by the vector code. */
8466 int bias_for_lowest = 1 - min_epilogue_iters;
8467 int bias_for_assumed = bias_for_lowest;
8468 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8469 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8471 /* When the amount of peeling is known at compile time, the first
8472 iteration will have exactly alignment_npeels active elements.
8473 In the worst case it will have at least one. */
8474 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8475 bias_for_lowest += lowest_vf - min_first_active;
8476 bias_for_assumed += assumed_vf - min_first_active;
8478 /* In these calculations the "- 1" converts loop iteration counts
8479 back to latch counts. */
8480 if (loop->any_upper_bound)
8481 loop->nb_iterations_upper_bound
8482 = (final_iter_may_be_partial
8483 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8484 lowest_vf) - 1
8485 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8486 lowest_vf) - 1);
8487 if (loop->any_likely_upper_bound)
8488 loop->nb_iterations_likely_upper_bound
8489 = (final_iter_may_be_partial
8490 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8491 + bias_for_lowest, lowest_vf) - 1
8492 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8493 + bias_for_lowest, lowest_vf) - 1);
8494 if (loop->any_estimate)
8495 loop->nb_iterations_estimate
8496 = (final_iter_may_be_partial
8497 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8498 assumed_vf) - 1
8499 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8500 assumed_vf) - 1);
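/* A worked instance of the adjustment above (assumed numbers): with a
   latch upper bound of 99 (100 iterations), lowest_vf = 4, no peeling
   for gaps and no full masking, bias_for_lowest is 1 and the new
   bound is floor ((99 + 1) / 4) - 1 = 24 latch executions of the
   vector loop.  */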
8502 if (dump_enabled_p ())
8504 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8506 dump_printf_loc (MSG_NOTE, vect_location,
8507 "LOOP VECTORIZED\n");
8508 if (loop->inner)
8509 dump_printf_loc (MSG_NOTE, vect_location,
8510 "OUTER LOOP VECTORIZED\n");
8511 dump_printf (MSG_NOTE, "\n");
8513 else
8515 dump_printf_loc (MSG_NOTE, vect_location,
8516 "LOOP EPILOGUE VECTORIZED (VS=");
8517 dump_dec (MSG_NOTE, current_vector_size);
8518 dump_printf (MSG_NOTE, ")\n");
8522 /* Free SLP instances here because otherwise stmt reference counting
8523 won't work. */
8524 slp_instance instance;
8525 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8526 vect_free_slp_instance (instance, true);
8527 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8528 /* Clear the safelen field since its value is invalid after vectorization,
8529 because the vectorized loop can have loop-carried dependencies. */
8530 loop->safelen = 0;
8532 /* Don't vectorize epilogue for epilogue. */
8533 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8534 epilogue = NULL;
8536 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8537 epilogue = NULL;
8539 if (epilogue)
8541 auto_vector_sizes vector_sizes;
8542 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8543 unsigned int next_size = 0;
8545 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8546 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8547 && known_eq (vf, lowest_vf))
8549 unsigned int eiters
8550 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8551 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8552 eiters = eiters % lowest_vf;
8553 epilogue->nb_iterations_upper_bound = eiters - 1;
8555 unsigned int ratio;
8556 while (next_size < vector_sizes.length ()
8557 && !(constant_multiple_p (current_vector_size,
8558 vector_sizes[next_size], &ratio)
8559 && eiters >= lowest_vf / ratio))
8560 next_size += 1;
8562 else
8563 while (next_size < vector_sizes.length ()
8564 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8565 next_size += 1;
8567 if (next_size == vector_sizes.length ())
8568 epilogue = NULL;
8571 if (epilogue)
8573 epilogue->force_vectorize = loop->force_vectorize;
8574 epilogue->safelen = loop->safelen;
8575 epilogue->dont_vectorize = false;
8577 /* We may need to if-convert epilogue to vectorize it. */
8578 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8579 tree_if_conversion (epilogue);
8582 return epilogue;
8585 /* The code below tries to perform a simple optimization - reverting
8586 if-conversion for masked stores: if the mask of a store is all zero,
8587 skip the store and, if possible, the producers of the stored values too.
8588 For example,
8589 for (i=0; i<n; i++)
8590 if (c[i])
8592 p1[i] += 1;
8593 p2[i] = p3[i] +2;
8595 this transformation will produce the following semi-hammock:
8597 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8599 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8600 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8601 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8602 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8603 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8604 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8608 void
8609 optimize_mask_stores (struct loop *loop)
8611 basic_block *bbs = get_loop_body (loop);
8612 unsigned nbbs = loop->num_nodes;
8613 unsigned i;
8614 basic_block bb;
8615 struct loop *bb_loop;
8616 gimple_stmt_iterator gsi;
8617 gimple *stmt;
8618 auto_vec<gimple *> worklist;
8620 vect_location = find_loop_location (loop);
8621 /* Pick up all masked stores in loop if any. */
8622 for (i = 0; i < nbbs; i++)
8624 bb = bbs[i];
8625 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8626 gsi_next (&gsi))
8628 stmt = gsi_stmt (gsi);
8629 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8630 worklist.safe_push (stmt);
8634 free (bbs);
8635 if (worklist.is_empty ())
8636 return;
8638 /* Loop has masked stores. */
8639 while (!worklist.is_empty ())
8641 gimple *last, *last_store;
8642 edge e, efalse;
8643 tree mask;
8644 basic_block store_bb, join_bb;
8645 gimple_stmt_iterator gsi_to;
8646 tree vdef, new_vdef;
8647 gphi *phi;
8648 tree vectype;
8649 tree zero;
8651 last = worklist.pop ();
8652 mask = gimple_call_arg (last, 2);
8653 bb = gimple_bb (last);
8654 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8655 to the same loop as if_bb. It could be different from LOOP when a
8656 two-level loop nest is vectorized and the mask_store belongs to the
8657 inner one. */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
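      /* The comparison is true only when every element of MASK is zero, so
         the TRUE edge bypasses STORE_BB and goes straight to JOIN_BB.  */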
      /* Create new PHI node for vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM_3 = VDEF <.MEM_1>
         and new PHI node will be created in join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
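      /* Memory uses dominated by JOIN_BB keep referring to the original
         virtual operand, which is now defined by the PHI, while the last
         sunk store defines the new name inside STORE_BB.  */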
      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Setup GSI_TO to the non-empty block start.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Move stmt to created bb\n%G", last);
          /* Move all stored value producers if possible.  */
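          /* Walk backwards from the store: a producer can be sunk only if
             it has no memory side effects, its LHS is a vector SSA name
             used nowhere but in STORE_BB, and it reads from the same
             memory state as the store itself.  */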
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* LHS of vectorized stmt must be SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;
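              /* Non-vector statements cannot be sunk into STORE_BB; remove
                 them if they are dead, otherwise stop the walk.  */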
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                  break;
                }
              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;
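              /* Statements that read memory are sunk only if they use the
                 same memory state (VUSE) as the masked store itself.  */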
              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Move stmt to created bb\n%G", stmt1);
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
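          /* This is only done when the next store on the worklist uses the
             same mask and is exactly the statement at which the backward
             walk stopped, so it can share the same STORE_BB.  */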
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}