gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 158                                                bool *);
 159
 160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 161    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 162    may already be set for general statements (not just data refs).  */
 163
 164 static opt_result
 165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 166                               bool vectype_maybe_set_p,
 167                               poly_uint64 *vf,
 168                               vec<stmt_vec_info > *mask_producers)
 169 {
 170   gimple *stmt = stmt_info->stmt;
 171
 172   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 173        && !STMT_VINFO_LIVE_P (stmt_info))
 174       || gimple_clobber_p (stmt))
 175     {
 176       if (dump_enabled_p ())
 177         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 178       return opt_result::success ();
 179     }
 180
 181   tree stmt_vectype, nunits_vectype;
 182   opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 183                                                    &nunits_vectype);
 184   if (!res)
 185     return res;
 186
 187   if (stmt_vectype)
 188     {
 189       if (STMT_VINFO_VECTYPE (stmt_info))
 190         /* The only case when a vectype had been already set is for stmts
 191            that contain a data ref, or for "pattern-stmts" (stmts generated
 192            by the vectorizer to represent/replace a certain idiom).  */
 193         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 194                      || vectype_maybe_set_p)
 195                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 196       else if (stmt_vectype == boolean_type_node)
 197         mask_producers->safe_push (stmt_info);
 198       else
 199         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 200     }
 201
 202   if (nunits_vectype)
 203     vect_update_max_nunits (vf, nunits_vectype);
 204
 205   return opt_result::success ();
 206 }
 207
 208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 209    types of STMT_INFO and all attached pattern statements and update
 210    the vectorization factor VF accordingly.  If some of the statements
 211    produce a mask result whose vector type can only be calculated later,
 212    add them to MASK_PRODUCERS.  Return true on success or false if
 213    something prevented vectorization.  */
 214
 215 static opt_result
 216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 217                             vec<stmt_vec_info > *mask_producers)
 218 {
 219   vec_info *vinfo = stmt_info->vinfo;
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
 223   opt_result res
 224     = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
 225   if (!res)
 226     return res;
 227
 228   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 229       && STMT_VINFO_RELATED_STMT (stmt_info))
 230     {
 231       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 232       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 233
 234       /* If a pattern statement has def stmts, analyze them too.  */
 235       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 236            !gsi_end_p (si); gsi_next (&si))
 237         {
 238           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 239           if (dump_enabled_p ())
 240             dump_printf_loc (MSG_NOTE, vect_location,
 241                              "==> examining pattern def stmt: %G",
 242                              def_stmt_info->stmt);
 243           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 244                                              vf, mask_producers))
 245           res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 246                                               vf, mask_producers);
 247           if (!res)
 248             return res;
 249         }
 250
 251       if (dump_enabled_p ())
 252         dump_printf_loc (MSG_NOTE, vect_location,
 253                          "==> examining pattern statement: %G",
 254                          stmt_info->stmt);
 255       res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
 256       if (!res)
 257         return res;
 258     }
 259
 260   return opt_result::success ();
 261 }
 262
 263 /* Function vect_determine_vectorization_factor
 264
 265    Determine the vectorization factor (VF).  VF is the number of data elements
 266    that are operated upon in parallel in a single iteration of the vectorized
 267    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 268    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 269    elements can fit in a single vector register.
 270
 271    We currently support vectorization of loops in which all types operated upon
 272    are of the same size.  Therefore this function currently sets VF according to
 273    the size of the types operated upon, and fails if there are multiple sizes
 274    in the loop.
 275
 276    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 277    original loop:
 278         for (i=0; i<N; i++){
 279           a[i] = b[i] + c[i];
 280         }
 281
 282    vectorized loop:
 283         for (i=0; i<N; i+=VF){
 284           a[i:VF] = b[i:VF] + c[i:VF];
 285         }
 286 */
 287
 288 static opt_result
 289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 290 {
 291   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 292   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 293   unsigned nbbs = loop->num_nodes;
 294   poly_uint64 vectorization_factor = 1;
 295   tree scalar_type = NULL_TREE;
 296   gphi *phi;
 297   tree vectype;
 298   stmt_vec_info stmt_info;
 299   unsigned i;
 300   auto_vec<stmt_vec_info> mask_producers;
 301
 302   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 303
 304   for (i = 0; i < nbbs; i++)
 305     {
 306       basic_block bb = bbs[i];
 307
 308       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 309            gsi_next (&si))
 310         {
 311           phi = si.phi ();
 312           stmt_info = loop_vinfo->lookup_stmt (phi);
 313           if (dump_enabled_p ())
 314             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 315                              phi);
 316
 317           gcc_assert (stmt_info);
 318
 319           if (STMT_VINFO_RELEVANT_P (stmt_info)
 320               || STMT_VINFO_LIVE_P (stmt_info))
 321             {
 322               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 323               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 324
 325               if (dump_enabled_p ())
 326                 dump_printf_loc (MSG_NOTE, vect_location,
 327                                  "get vectype for scalar type:  %T\n",
 328                                  scalar_type);
 329
 330               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 331               if (!vectype)
 332                 return opt_result::failure_at (phi,
 333                                                "not vectorized: unsupported "
 334                                                "data-type %T\n",
 335                                                scalar_type);
 336               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 337
 338               if (dump_enabled_p ())
 339                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 340                                  vectype);
 341
 342               if (dump_enabled_p ())
 343                 {
 344                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 345                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 346                   dump_printf (MSG_NOTE, "\n");
 347                 }
 348
 349               vect_update_max_nunits (&vectorization_factor, vectype);
 350             }
 351         }
 352
 353       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 354            gsi_next (&si))
 355         {
 356           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 357           opt_result res
 358             = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 359                                           &mask_producers);
 360           if (!res)
 361             return res;
 362         }
 363     }
 364
 365   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 366   if (dump_enabled_p ())
 367     {
 368       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 369       dump_dec (MSG_NOTE, vectorization_factor);
 370       dump_printf (MSG_NOTE, "\n");
 371     }
 372
 373   if (known_le (vectorization_factor, 1U))
 374     return opt_result::failure_at (vect_location,
 375                                    "not vectorized: unsupported data-type\n");
 376   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 377
 378   for (i = 0; i < mask_producers.length (); i++)
 379     {
 380       stmt_info = mask_producers[i];
 381       opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 382       if (!mask_type)
 383         return opt_result::propagate_failure (mask_type);
 384       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 385     }
 386
 387   return opt_result::success ();
 388 }
 389
 390
 391 /* Function vect_is_simple_iv_evolution.
 392
 393    FORNOW: A simple evolution of an induction variables in the loop is
 394    considered a polynomial evolution.  */
 395
 396 static bool
 397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 398                              tree * step)
 399 {
 400   tree init_expr;
 401   tree step_expr;
 402   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 403   basic_block bb;
 404
 405   /* When there is no evolution in this loop, the evolution function
 406      is not "simple".  */
 407   if (evolution_part == NULL_TREE)
 408     return false;
 409
 410   /* When the evolution is a polynomial of degree >= 2
 411      the evolution function is not "simple".  */
 412   if (tree_is_chrec (evolution_part))
 413     return false;
 414
 415   step_expr = evolution_part;
 416   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 417
 418   if (dump_enabled_p ())
 419     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 420                      step_expr, init_expr);
 421
 422   *init = init_expr;
 423   *step = step_expr;
 424
 425   if (TREE_CODE (step_expr) != INTEGER_CST
 426       && (TREE_CODE (step_expr) != SSA_NAME
 427           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 428               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 429           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 430               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 431                   || !flag_associative_math)))
 432       && (TREE_CODE (step_expr) != REAL_CST
 433           || !flag_associative_math))
 434     {
 435       if (dump_enabled_p ())
 436         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 437                          "step unknown.\n");
 438       return false;
 439     }
 440
 441   return true;
 442 }
 443
 444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 445    what we are assuming is a double reduction.  For example, given
 446    a structure like this:
 447
 448       outer1:
 449         x_1 = PHI <x_4(outer2), ...>;
 450         ...
 451
 452       inner:
 453         x_2 = PHI <x_1(outer1), ...>;
 454         ...
 455         x_3 = ...;
 456         ...
 457
 458       outer2:
 459         x_4 = PHI <x_3(inner)>;
 460         ...
 461
 462    outer loop analysis would treat x_1 as a double reduction phi and
 463    this function would then return true for x_2.  */
 464
 465 static bool
 466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
 467 {
 468   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 469   use_operand_p use_p;
 470   ssa_op_iter op_iter;
 471   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 472     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 473       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 474         return true;
 475   return false;
 476 }
 477
 478 /* Function vect_analyze_scalar_cycles_1.
 479
 480    Examine the cross iteration def-use cycles of scalar variables
 481    in LOOP.  LOOP_VINFO represents the loop that is now being
 482    considered for vectorization (can be LOOP, or an outer-loop
 483    enclosing LOOP).  */
 484
 485 static void
 486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 487 {
 488   basic_block bb = loop->header;
 489   tree init, step;
 490   auto_vec<stmt_vec_info, 64> worklist;
 491   gphi_iterator gsi;
 492   bool double_reduc;
 493
 494   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 495
 496   /* First - identify all inductions.  Reduction detection assumes that all the
 497      inductions have been identified, therefore, this order must not be
 498      changed.  */
 499   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 500     {
 501       gphi *phi = gsi.phi ();
 502       tree access_fn = NULL;
 503       tree def = PHI_RESULT (phi);
 504       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 505
 506       if (dump_enabled_p ())
 507         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 508
 509       /* Skip virtual phi's.  The data dependences that are associated with
 510          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 511       if (virtual_operand_p (def))
 512         continue;
 513
 514       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 515
 516       /* Analyze the evolution function.  */
 517       access_fn = analyze_scalar_evolution (loop, def);
 518       if (access_fn)
 519         {
 520           STRIP_NOPS (access_fn);
 521           if (dump_enabled_p ())
 522             dump_printf_loc (MSG_NOTE, vect_location,
 523                              "Access function of PHI: %T\n", access_fn);
 524           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 525             = initial_condition_in_loop_num (access_fn, loop->num);
 526           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 527             = evolution_part_in_loop_num (access_fn, loop->num);
 528         }
 529
 530       if (!access_fn
 531           || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
 532           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 533           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 534               && TREE_CODE (step) != INTEGER_CST))
 535         {
 536           worklist.safe_push (stmt_vinfo);
 537           continue;
 538         }
 539
 540       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 541                   != NULL_TREE);
 542       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 543
 544       if (dump_enabled_p ())
 545         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 546       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 547     }
 548
 549
 550   /* Second - identify all reductions and nested cycles.  */
 551   while (worklist.length () > 0)
 552     {
 553       stmt_vec_info stmt_vinfo = worklist.pop ();
 554       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 555       tree def = PHI_RESULT (phi);
 556
 557       if (dump_enabled_p ())
 558         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 559
 560       gcc_assert (!virtual_operand_p (def)
 561                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 562
 563       stmt_vec_info reduc_stmt_info
 564         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
 565       if (reduc_stmt_info)
 566         {
 567           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 568           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 569           if (double_reduc)
 570             {
 571               if (dump_enabled_p ())
 572                 dump_printf_loc (MSG_NOTE, vect_location,
 573                                  "Detected double reduction.\n");
 574
 575               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 576               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 577             }
 578           else
 579             {
 580               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 581                 {
 582                   if (dump_enabled_p ())
 583                     dump_printf_loc (MSG_NOTE, vect_location,
 584                                      "Detected vectorizable nested cycle.\n");
 585
 586                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 587                 }
 588               else
 589                 {
 590                   if (dump_enabled_p ())
 591                     dump_printf_loc (MSG_NOTE, vect_location,
 592                                      "Detected reduction.\n");
 593
 594                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 595                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 596                   /* Store the reduction cycles for possible vectorization in
 597                      loop-aware SLP if it was not detected as reduction
 598                      chain.  */
 599                   if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
 600                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 601                       (reduc_stmt_info);
 602                 }
 603             }
 604         }
 605       else
 606         if (dump_enabled_p ())
 607           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 608                            "Unknown def-use cycle pattern.\n");
 609     }
 610 }
 611
 612
 613 /* Function vect_analyze_scalar_cycles.
 614
 615    Examine the cross iteration def-use cycles of scalar variables, by
 616    analyzing the loop-header PHIs of scalar variables.  Classify each
 617    cycle as one of the following: invariant, induction, reduction, unknown.
 618    We do that for the loop represented by LOOP_VINFO, and also to its
 619    inner-loop, if exists.
 620    Examples for scalar cycles:
 621
 622    Example1: reduction:
 623
 624               loop1:
 625               for (i=0; i<N; i++)
 626                  sum += a[i];
 627
 628    Example2: induction:
 629
 630               loop2:
 631               for (i=0; i<N; i++)
 632                  a[i] = i;  */
 633
 634 static void
 635 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 636 {
 637   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 638
 639   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 640
 641   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 642      Reductions in such inner-loop therefore have different properties than
 643      the reductions in the nest that gets vectorized:
 644      1. When vectorized, they are executed in the same order as in the original
 645         scalar loop, so we can't change the order of computation when
 646         vectorizing them.
 647      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 648         current checks are too strict.  */
 649
 650   if (loop->inner)
 651     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 652 }
 653
 654 /* Transfer group and reduction information from STMT_INFO to its
 655    pattern stmt.  */
 656
 657 static void
 658 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 659 {
 660   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 661   stmt_vec_info stmtp;
 662   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 663               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 664   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 665   do
 666     {
 667       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 668       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 669       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 670       if (stmt_info)
 671         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 672           = STMT_VINFO_RELATED_STMT (stmt_info);
 673     }
 674   while (stmt_info);
 675   STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
 676 }
 677
 678 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 679
 680 static void
 681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 682 {
 683   stmt_vec_info first;
 684   unsigned i;
 685
 686   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 687     if (STMT_VINFO_IN_PATTERN_P (first))
 688       {
 689         stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 690         while (next)
 691           {
 692             if (! STMT_VINFO_IN_PATTERN_P (next))
 693               break;
 694             next = REDUC_GROUP_NEXT_ELEMENT (next);
 695           }
 696         /* If not all stmt in the chain are patterns try to handle
 697            the chain without patterns.  */
 698         if (! next)
 699           {
 700             vect_fixup_reduc_chain (first);
 701             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 702               = STMT_VINFO_RELATED_STMT (first);
 703           }
 704       }
 705 }
 706
 707 /* Function vect_get_loop_niters.
 708
 709    Determine how many iterations the loop is executed and place it
 710    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 711    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 712    niter information holds in ASSUMPTIONS.
 713
 714    Return the loop exit condition.  */
 715
 716
 717 static gcond *
 718 vect_get_loop_niters (class loop *loop, tree *assumptions,
 719                       tree *number_of_iterations, tree *number_of_iterationsm1)
 720 {
 721   edge exit = single_exit (loop);
 722   class tree_niter_desc niter_desc;
 723   tree niter_assumptions, niter, may_be_zero;
 724   gcond *cond = get_loop_exit_condition (loop);
 725
 726   *assumptions = boolean_true_node;
 727   *number_of_iterationsm1 = chrec_dont_know;
 728   *number_of_iterations = chrec_dont_know;
 729   DUMP_VECT_SCOPE ("get_loop_niters");
 730
 731   if (!exit)
 732     return cond;
 733
 734   may_be_zero = NULL_TREE;
 735   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 736       || chrec_contains_undetermined (niter_desc.niter))
 737     return cond;
 738
 739   niter_assumptions = niter_desc.assumptions;
 740   may_be_zero = niter_desc.may_be_zero;
 741   niter = niter_desc.niter;
 742
 743   if (may_be_zero && integer_zerop (may_be_zero))
 744     may_be_zero = NULL_TREE;
 745
 746   if (may_be_zero)
 747     {
 748       if (COMPARISON_CLASS_P (may_be_zero))
 749         {
 750           /* Try to combine may_be_zero with assumptions, this can simplify
 751              computation of niter expression.  */
 752           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 753             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 754                                              niter_assumptions,
 755                                              fold_build1 (TRUTH_NOT_EXPR,
 756                                                           boolean_type_node,
 757                                                           may_be_zero));
 758           else
 759             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 760                                  build_int_cst (TREE_TYPE (niter), 0),
 761                                  rewrite_to_non_trapping_overflow (niter));
 762
 763           may_be_zero = NULL_TREE;
 764         }
 765       else if (integer_nonzerop (may_be_zero))
 766         {
 767           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 768           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 769           return cond;
 770         }
 771       else
 772         return cond;
 773     }
 774
 775   *assumptions = niter_assumptions;
 776   *number_of_iterationsm1 = niter;
 777
 778   /* We want the number of loop header executions which is the number
 779      of latch executions plus one.
 780      ???  For UINT_MAX latch executions this number overflows to zero
 781      for loops like do { n++; } while (n != 0);  */
 782   if (niter && !chrec_contains_undetermined (niter))
 783     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 784                           build_int_cst (TREE_TYPE (niter), 1));
 785   *number_of_iterations = niter;
 786
 787   return cond;
 788 }
 789
 790 /* Function bb_in_loop_p
 791
 792    Used as predicate for dfs order traversal of the loop bbs.  */
 793
 794 static bool
 795 bb_in_loop_p (const_basic_block bb, const void *data)
 796 {
 797   const class loop *const loop = (const class loop *)data;
 798   if (flow_bb_inside_loop_p (loop, bb))
 799     return true;
 800   return false;
 801 }
 802
 803
 804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 805    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 806
 807 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 808   : vec_info (vec_info::loop, init_cost (loop_in), shared),
 809     loop (loop_in),
 810     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 811     num_itersm1 (NULL_TREE),
 812     num_iters (NULL_TREE),
 813     num_iters_unchanged (NULL_TREE),
 814     num_iters_assumptions (NULL_TREE),
 815     th (0),
 816     versioning_threshold (0),
 817     vectorization_factor (0),
 818     max_vectorization_factor (0),
 819     mask_skip_niters (NULL_TREE),
 820     mask_compare_type (NULL_TREE),
 821     simd_if_cond (NULL_TREE),
 822     unaligned_dr (NULL),
 823     peeling_for_alignment (0),
 824     ptr_mask (0),
 825     ivexpr_map (NULL),
 826     scan_map (NULL),
 827     slp_unrolling_factor (1),
 828     single_scalar_iteration_cost (0),
 829     vectorizable (false),
 830     can_fully_mask_p (true),
 831     fully_masked_p (false),
 832     peeling_for_gaps (false),
 833     peeling_for_niter (false),
 834     no_data_dependencies (false),
 835     has_mask_store (false),
 836     scalar_loop_scaling (profile_probability::uninitialized ()),
 837     scalar_loop (NULL),
 838     orig_loop_info (NULL)
 839 {
 840   /* CHECKME: We want to visit all BBs before their successors (except for
 841      latch blocks, for which this assertion wouldn't hold).  In the simple
 842      case of the loop forms we allow, a dfs order of the BBs would the same
 843      as reversed postorder traversal, so we are safe.  */
 844
 845   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 846                                           bbs, loop->num_nodes, loop);
 847   gcc_assert (nbbs == loop->num_nodes);
 848
 849   for (unsigned int i = 0; i < nbbs; i++)
 850     {
 851       basic_block bb = bbs[i];
 852       gimple_stmt_iterator si;
 853
 854       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 855         {
 856           gimple *phi = gsi_stmt (si);
 857           gimple_set_uid (phi, 0);
 858           add_stmt (phi);
 859         }
 860
 861       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 862         {
 863           gimple *stmt = gsi_stmt (si);
 864           gimple_set_uid (stmt, 0);
 865           add_stmt (stmt);
 866           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 867              third argument is the #pragma omp simd if (x) condition, when 0,
 868              loop shouldn't be vectorized, when non-zero constant, it should
 869              be vectorized normally, otherwise versioned with vectorized loop
 870              done if the condition is non-zero at runtime.  */
 871           if (loop_in->simduid
 872               && is_gimple_call (stmt)
 873               && gimple_call_internal_p (stmt)
 874               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 875               && gimple_call_num_args (stmt) >= 3
 876               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 877               && (loop_in->simduid
 878                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 879             {
 880               tree arg = gimple_call_arg (stmt, 2);
 881               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 882                 simd_if_cond = arg;
 883               else
 884                 gcc_assert (integer_nonzerop (arg));
 885             }
 886         }
 887     }
 888 }
 889
 890 /* Free all levels of MASKS.  */
 891
 892 void
 893 release_vec_loop_masks (vec_loop_masks *masks)
 894 {
 895   rgroup_masks *rgm;
 896   unsigned int i;
 897   FOR_EACH_VEC_ELT (*masks, i, rgm)
 898     rgm->masks.release ();
 899   masks->release ();
 900 }
 901
 902 /* Free all memory used by the _loop_vec_info, as well as all the
 903    stmt_vec_info structs of all the stmts in the loop.  */
 904
 905 _loop_vec_info::~_loop_vec_info ()
 906 {
 907   free (bbs);
 908
 909   release_vec_loop_masks (&masks);
 910   delete ivexpr_map;
 911   delete scan_map;
 912
 913   loop->aux = NULL;
 914 }
 915
 916 /* Return an invariant or register for EXPR and emit necessary
 917    computations in the LOOP_VINFO loop preheader.  */
 918
 919 tree
 920 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 921 {
 922   if (is_gimple_reg (expr)
 923       || is_gimple_min_invariant (expr))
 924     return expr;
 925
 926   if (! loop_vinfo->ivexpr_map)
 927     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 928   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 929   if (! cached)
 930     {
 931       gimple_seq stmts = NULL;
 932       cached = force_gimple_operand (unshare_expr (expr),
 933                                      &stmts, true, NULL_TREE);
 934       if (stmts)
 935         {
 936           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 937           gsi_insert_seq_on_edge_immediate (e, stmts);
 938         }
 939     }
 940   return cached;
 941 }
 942
 943 /* Return true if we can use CMP_TYPE as the comparison type to produce
 944    all masks required to mask LOOP_VINFO.  */
 945
 946 static bool
 947 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 948 {
 949   rgroup_masks *rgm;
 950   unsigned int i;
 951   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 952     if (rgm->mask_type != NULL_TREE
 953         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 954                                             cmp_type, rgm->mask_type,
 955                                             OPTIMIZE_FOR_SPEED))
 956       return false;
 957   return true;
 958 }
 959
 960 /* Calculate the maximum number of scalars per iteration for every
 961    rgroup in LOOP_VINFO.  */
 962
 963 static unsigned int
 964 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 965 {
 966   unsigned int res = 1;
 967   unsigned int i;
 968   rgroup_masks *rgm;
 969   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 970     res = MAX (res, rgm->max_nscalars_per_iter);
 971   return res;
 972 }
 973
 974 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
 975    whether we can actually generate the masks required.  Return true if so,
 976    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
 977
 978 static bool
 979 vect_verify_full_masking (loop_vec_info loop_vinfo)
 980 {
 981   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 982   unsigned int min_ni_width;
 983   unsigned int max_nscalars_per_iter
 984     = vect_get_max_nscalars_per_iter (loop_vinfo);
 985
 986   /* Use a normal loop if there are no statements that need masking.
 987      This only happens in rare degenerate cases: it means that the loop
 988      has no loads, no stores, and no live-out values.  */
 989   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
 990     return false;
 991
 992   /* Get the maximum number of iterations that is representable
 993      in the counter type.  */
 994   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
 995   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
 996
 997   /* Get a more refined estimate for the number of iterations.  */
 998   widest_int max_back_edges;
 999   if (max_loop_iterations (loop, &max_back_edges))
1000     max_ni = wi::smin (max_ni, max_back_edges + 1);
1001
1002   /* Account for rgroup masks, in which each bit is replicated N times.  */
1003   max_ni *= max_nscalars_per_iter;
1004
1005   /* Work out how many bits we need to represent the limit.  */
1006   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1007
1008   /* Find a scalar mode for which WHILE_ULT is supported.  */
1009   opt_scalar_int_mode cmp_mode_iter;
1010   tree cmp_type = NULL_TREE;
1011   tree iv_type = NULL_TREE;
1012   widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1013   unsigned int iv_precision = UINT_MAX;
1014
1015   if (iv_limit != -1)
1016     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1017                                       UNSIGNED);
1018
1019   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1020     {
1021       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1022       if (cmp_bits >= min_ni_width
1023           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1024         {
1025           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1026           if (this_type
1027               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1028             {
1029               /* Although we could stop as soon as we find a valid mode,
1030                  there are at least two reasons why that's not always the
1031                  best choice:
1032
1033                  - An IV that's Pmode or wider is more likely to be reusable
1034                    in address calculations than an IV that's narrower than
1035                    Pmode.
1036
1037                  - Doing the comparison in IV_PRECISION or wider allows
1038                    a natural 0-based IV, whereas using a narrower comparison
1039                    type requires mitigations against wrap-around.
1040
1041                  Conversely, if the IV limit is variable, doing the comparison
1042                  in a wider type than the original type can introduce
1043                  unnecessary extensions, so picking the widest valid mode
1044                  is not always a good choice either.
1045
1046                  Here we prefer the first IV type that's Pmode or wider,
1047                  and the first comparison type that's IV_PRECISION or wider.
1048                  (The comparison type must be no wider than the IV type,
1049                  to avoid extensions in the vector loop.)
1050
1051                  ??? We might want to try continuing beyond Pmode for ILP32
1052                  targets if CMP_BITS < IV_PRECISION.  */
1053               iv_type = this_type;
1054               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1055                 cmp_type = this_type;
1056               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1057                 break;
1058             }
1059         }
1060     }
1061
1062   if (!cmp_type)
1063     return false;
1064
1065   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1066   LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1067   return true;
1068 }
1069
1070 /* Calculate the cost of one scalar iteration of the loop.  */
1071 static void
1072 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1073 {
1074   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1075   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1076   int nbbs = loop->num_nodes, factor;
1077   int innerloop_iters, i;
1078
1079   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1080
1081   /* Gather costs for statements in the scalar loop.  */
1082
1083   /* FORNOW.  */
1084   innerloop_iters = 1;
1085   if (loop->inner)
1086     innerloop_iters = 50; /* FIXME */
1087
1088   for (i = 0; i < nbbs; i++)
1089     {
1090       gimple_stmt_iterator si;
1091       basic_block bb = bbs[i];
1092
1093       if (bb->loop_father == loop->inner)
1094         factor = innerloop_iters;
1095       else
1096         factor = 1;
1097
1098       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099         {
1100           gimple *stmt = gsi_stmt (si);
1101           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1102
1103           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1104             continue;
1105
1106           /* Skip stmts that are not vectorized inside the loop.  */
1107           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1108           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1109               && (!STMT_VINFO_LIVE_P (vstmt_info)
1110                   || !VECTORIZABLE_CYCLE_DEF
1111                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1112             continue;
1113
1114           vect_cost_for_stmt kind;
1115           if (STMT_VINFO_DATA_REF (stmt_info))
1116             {
1117               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1118                kind = scalar_load;
1119              else
1120                kind = scalar_store;
1121             }
1122           else
1123             kind = scalar_stmt;
1124
1125           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1126                             factor, kind, stmt_info, 0, vect_prologue);
1127         }
1128     }
1129
1130   /* Now accumulate cost.  */
1131   void *target_cost_data = init_cost (loop);
1132   stmt_info_for_cost *si;
1133   int j;
1134   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1135                     j, si)
1136     (void) add_stmt_cost (target_cost_data, si->count,
1137                           si->kind, si->stmt_info, si->misalign,
1138                           vect_body);
1139   unsigned dummy, body_cost = 0;
1140   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1141   destroy_cost_data (target_cost_data);
1142   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1143 }
1144
1145
1146 /* Function vect_analyze_loop_form_1.
1147
1148    Verify that certain CFG restrictions hold, including:
1149    - the loop has a pre-header
1150    - the loop has a single entry and exit
1151    - the loop exit condition is simple enough
1152    - the number of iterations can be analyzed, i.e, a countable loop.  The
1153      niter could be analyzed under some assumptions.  */
1154
1155 opt_result
1156 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1157                           tree *assumptions, tree *number_of_iterationsm1,
1158                           tree *number_of_iterations, gcond **inner_loop_cond)
1159 {
1160   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1161
1162   /* Different restrictions apply when we are considering an inner-most loop,
1163      vs. an outer (nested) loop.
1164      (FORNOW. May want to relax some of these restrictions in the future).  */
1165
1166   if (!loop->inner)
1167     {
1168       /* Inner-most loop.  We currently require that the number of BBs is
1169          exactly 2 (the header and latch).  Vectorizable inner-most loops
1170          look like this:
1171
1172                         (pre-header)
1173                            |
1174                           header <--------+
1175                            | |            |
1176                            | +--> latch --+
1177                            |
1178                         (exit-bb)  */
1179
1180       if (loop->num_nodes != 2)
1181         return opt_result::failure_at (vect_location,
1182                                        "not vectorized:"
1183                                        " control flow in loop.\n");
1184
1185       if (empty_block_p (loop->header))
1186         return opt_result::failure_at (vect_location,
1187                                        "not vectorized: empty loop.\n");
1188     }
1189   else
1190     {
1191       class loop *innerloop = loop->inner;
1192       edge entryedge;
1193
1194       /* Nested loop. We currently require that the loop is doubly-nested,
1195          contains a single inner loop, and the number of BBs is exactly 5.
1196          Vectorizable outer-loops look like this:
1197
1198                         (pre-header)
1199                            |
1200                           header <---+
1201                            |         |
1202                           inner-loop |
1203                            |         |
1204                           tail ------+
1205                            |
1206                         (exit-bb)
1207
1208          The inner-loop has the properties expected of inner-most loops
1209          as described above.  */
1210
1211       if ((loop->inner)->inner || (loop->inner)->next)
1212         return opt_result::failure_at (vect_location,
1213                                        "not vectorized:"
1214                                        " multiple nested loops.\n");
1215
1216       if (loop->num_nodes != 5)
1217         return opt_result::failure_at (vect_location,
1218                                        "not vectorized:"
1219                                        " control flow in loop.\n");
1220
1221       entryedge = loop_preheader_edge (innerloop);
1222       if (entryedge->src != loop->header
1223           || !single_exit (innerloop)
1224           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1225         return opt_result::failure_at (vect_location,
1226                                        "not vectorized:"
1227                                        " unsupported outerloop form.\n");
1228
1229       /* Analyze the inner-loop.  */
1230       tree inner_niterm1, inner_niter, inner_assumptions;
1231       opt_result res
1232         = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1233                                     &inner_assumptions, &inner_niterm1,
1234                                     &inner_niter, NULL);
1235       if (!res)
1236         {
1237           if (dump_enabled_p ())
1238             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239                              "not vectorized: Bad inner loop.\n");
1240           return res;
1241         }
1242
1243       /* Don't support analyzing niter under assumptions for inner
1244          loop.  */
1245       if (!integer_onep (inner_assumptions))
1246         return opt_result::failure_at (vect_location,
1247                                        "not vectorized: Bad inner loop.\n");
1248
1249       if (!expr_invariant_in_loop_p (loop, inner_niter))
1250         return opt_result::failure_at (vect_location,
1251                                        "not vectorized: inner-loop count not"
1252                                        " invariant.\n");
1253
1254       if (dump_enabled_p ())
1255         dump_printf_loc (MSG_NOTE, vect_location,
1256                          "Considering outer-loop vectorization.\n");
1257     }
1258
1259   if (!single_exit (loop))
1260     return opt_result::failure_at (vect_location,
1261                                    "not vectorized: multiple exits.\n");
1262   if (EDGE_COUNT (loop->header->preds) != 2)
1263     return opt_result::failure_at (vect_location,
1264                                    "not vectorized:"
1265                                    " too many incoming edges.\n");
1266
1267   /* We assume that the loop exit condition is at the end of the loop. i.e,
1268      that the loop is represented as a do-while (with a proper if-guard
1269      before the loop if needed), where the loop header contains all the
1270      executable statements, and the latch is empty.  */
1271   if (!empty_block_p (loop->latch)
1272       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1273     return opt_result::failure_at (vect_location,
1274                                    "not vectorized: latch block not empty.\n");
1275
1276   /* Make sure the exit is not abnormal.  */
1277   edge e = single_exit (loop);
1278   if (e->flags & EDGE_ABNORMAL)
1279     return opt_result::failure_at (vect_location,
1280                                    "not vectorized:"
1281                                    " abnormal loop exit edge.\n");
1282
1283   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1284                                      number_of_iterationsm1);
1285   if (!*loop_cond)
1286     return opt_result::failure_at
1287       (vect_location,
1288        "not vectorized: complicated exit condition.\n");
1289
1290   if (integer_zerop (*assumptions)
1291       || !*number_of_iterations
1292       || chrec_contains_undetermined (*number_of_iterations))
1293     return opt_result::failure_at
1294       (*loop_cond,
1295        "not vectorized: number of iterations cannot be computed.\n");
1296
1297   if (integer_zerop (*number_of_iterations))
1298     return opt_result::failure_at
1299       (*loop_cond,
1300        "not vectorized: number of iterations = 0.\n");
1301
1302   return opt_result::success ();
1303 }
1304
1305 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1306
1307 opt_loop_vec_info
1308 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1309 {
1310   tree assumptions, number_of_iterations, number_of_iterationsm1;
1311   gcond *loop_cond, *inner_loop_cond = NULL;
1312
1313   opt_result res
1314     = vect_analyze_loop_form_1 (loop, &loop_cond,
1315                                 &assumptions, &number_of_iterationsm1,
1316                                 &number_of_iterations, &inner_loop_cond);
1317   if (!res)
1318     return opt_loop_vec_info::propagate_failure (res);
1319
1320   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1321   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1322   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1323   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1324   if (!integer_onep (assumptions))
1325     {
1326       /* We consider to vectorize this loop by versioning it under
1327          some assumptions.  In order to do this, we need to clear
1328          existing information computed by scev and niter analyzer.  */
1329       scev_reset_htab ();
1330       free_numbers_of_iterations_estimates (loop);
1331       /* Also set flag for this loop so that following scev and niter
1332          analysis are done under the assumptions.  */
1333       loop_constraint_set (loop, LOOP_C_FINITE);
1334       /* Also record the assumptions for versioning.  */
1335       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1336     }
1337
1338   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1339     {
1340       if (dump_enabled_p ())
1341         {
1342           dump_printf_loc (MSG_NOTE, vect_location,
1343                            "Symbolic number of iterations is ");
1344           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1345           dump_printf (MSG_NOTE, "\n");
1346         }
1347     }
1348
1349   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1350   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1351   if (inner_loop_cond)
1352     {
1353       stmt_vec_info inner_loop_cond_info
1354         = loop_vinfo->lookup_stmt (inner_loop_cond);
1355       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1356     }
1357
1358   gcc_assert (!loop->aux);
1359   loop->aux = loop_vinfo;
1360   return opt_loop_vec_info::success (loop_vinfo);
1361 }
1362
1363
1364
1365 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1366    statements update the vectorization factor.  */
1367
1368 static void
1369 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1370 {
1371   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1372   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1373   int nbbs = loop->num_nodes;
1374   poly_uint64 vectorization_factor;
1375   int i;
1376
1377   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1378
1379   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1380   gcc_assert (known_ne (vectorization_factor, 0U));
1381
1382   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1383      vectorization factor of the loop is the unrolling factor required by
1384      the SLP instances.  If that unrolling factor is 1, we say, that we
1385      perform pure SLP on loop - cross iteration parallelism is not
1386      exploited.  */
1387   bool only_slp_in_loop = true;
1388   for (i = 0; i < nbbs; i++)
1389     {
1390       basic_block bb = bbs[i];
1391       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1392            gsi_next (&si))
1393         {
1394           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1395           stmt_info = vect_stmt_to_vectorize (stmt_info);
1396           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1397                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1398               && !PURE_SLP_STMT (stmt_info))
1399             /* STMT needs both SLP and loop-based vectorization.  */
1400             only_slp_in_loop = false;
1401         }
1402     }
1403
1404   if (only_slp_in_loop)
1405     {
1406       if (dump_enabled_p ())
1407         dump_printf_loc (MSG_NOTE, vect_location,
1408                          "Loop contains only SLP stmts\n");
1409       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1410     }
1411   else
1412     {
1413       if (dump_enabled_p ())
1414         dump_printf_loc (MSG_NOTE, vect_location,
1415                          "Loop contains SLP and non-SLP stmts\n");
1416       /* Both the vectorization factor and unroll factor have the form
1417          loop_vinfo->vector_size * X for some rational X, so they must have
1418          a common multiple.  */
1419       vectorization_factor
1420         = force_common_multiple (vectorization_factor,
1421                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1422     }
1423
1424   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1425   if (dump_enabled_p ())
1426     {
1427       dump_printf_loc (MSG_NOTE, vect_location,
1428                        "Updating vectorization factor to ");
1429       dump_dec (MSG_NOTE, vectorization_factor);
1430       dump_printf (MSG_NOTE, ".\n");
1431     }
1432 }
1433
1434 /* Return true if STMT_INFO describes a double reduction phi and if
1435    the other phi in the reduction is also relevant for vectorization.
1436    This rejects cases such as:
1437
1438       outer1:
1439         x_1 = PHI <x_3(outer2), ...>;
1440         ...
1441
1442       inner:
1443         x_2 = ...;
1444         ...
1445
1446       outer2:
1447         x_3 = PHI <x_2(inner)>;
1448
1449    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1450
1451 static bool
1452 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1453 {
1454   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1455     return false;
1456
1457   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1458 }
1459
1460 /* Function vect_analyze_loop_operations.
1461
1462    Scan the loop stmts and make sure they are all vectorizable.  */
1463
1464 static opt_result
1465 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1466 {
1467   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1468   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1469   int nbbs = loop->num_nodes;
1470   int i;
1471   stmt_vec_info stmt_info;
1472   bool need_to_vectorize = false;
1473   bool ok;
1474
1475   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1476
1477   auto_vec<stmt_info_for_cost> cost_vec;
1478
1479   for (i = 0; i < nbbs; i++)
1480     {
1481       basic_block bb = bbs[i];
1482
1483       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1484            gsi_next (&si))
1485         {
1486           gphi *phi = si.phi ();
1487           ok = true;
1488
1489           stmt_info = loop_vinfo->lookup_stmt (phi);
1490           if (dump_enabled_p ())
1491             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1492           if (virtual_operand_p (gimple_phi_result (phi)))
1493             continue;
1494
1495           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1496              (i.e., a phi in the tail of the outer-loop).  */
1497           if (! is_loop_header_bb_p (bb))
1498             {
1499               /* FORNOW: we currently don't support the case that these phis
1500                  are not used in the outerloop (unless it is double reduction,
1501                  i.e., this phi is vect_reduction_def), cause this case
1502                  requires to actually do something here.  */
1503               if (STMT_VINFO_LIVE_P (stmt_info)
1504                   && !vect_active_double_reduction_p (stmt_info))
1505                 return opt_result::failure_at (phi,
1506                                                "Unsupported loop-closed phi"
1507                                                " in outer-loop.\n");
1508
1509               /* If PHI is used in the outer loop, we check that its operand
1510                  is defined in the inner loop.  */
1511               if (STMT_VINFO_RELEVANT_P (stmt_info))
1512                 {
1513                   tree phi_op;
1514
1515                   if (gimple_phi_num_args (phi) != 1)
1516                     return opt_result::failure_at (phi, "unsupported phi");
1517
1518                   phi_op = PHI_ARG_DEF (phi, 0);
1519                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1520                   if (!op_def_info)
1521                     return opt_result::failure_at (phi, "unsupported phi\n");
1522
1523                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1524                       && (STMT_VINFO_RELEVANT (op_def_info)
1525                           != vect_used_in_outer_by_reduction))
1526                     return opt_result::failure_at (phi, "unsupported phi\n");
1527
1528                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1529                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1530                            == vect_double_reduction_def))
1531                       && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1532                     return opt_result::failure_at (phi, "unsupported phi\n");
1533                 }
1534
1535               continue;
1536             }
1537
1538           gcc_assert (stmt_info);
1539
1540           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1541                || STMT_VINFO_LIVE_P (stmt_info))
1542               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1543             /* A scalar-dependence cycle that we don't support.  */
1544             return opt_result::failure_at (phi,
1545                                            "not vectorized:"
1546                                            " scalar dependence cycle.\n");
1547
1548           if (STMT_VINFO_RELEVANT_P (stmt_info))
1549             {
1550               need_to_vectorize = true;
1551               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1552                   && ! PURE_SLP_STMT (stmt_info))
1553                 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1554                                              &cost_vec);
1555               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1556                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1557                             == vect_double_reduction_def)
1558                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1559                        && ! PURE_SLP_STMT (stmt_info))
1560                 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1561             }
1562
1563           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1564           if (ok
1565               && STMT_VINFO_LIVE_P (stmt_info)
1566               && !PURE_SLP_STMT (stmt_info))
1567             ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1568                                               -1, false, &cost_vec);
1569
1570           if (!ok)
1571             return opt_result::failure_at (phi,
1572                                            "not vectorized: relevant phi not "
1573                                            "supported: %G",
1574                                            static_cast <gimple *> (phi));
1575         }
1576
1577       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1578            gsi_next (&si))
1579         {
1580           gimple *stmt = gsi_stmt (si);
1581           if (!gimple_clobber_p (stmt))
1582             {
1583               opt_result res
1584                 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1585                                      &need_to_vectorize,
1586                                      NULL, NULL, &cost_vec);
1587               if (!res)
1588                 return res;
1589             }
1590         }
1591     } /* bbs */
1592
1593   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1594
1595   /* All operations in the loop are either irrelevant (deal with loop
1596      control, or dead), or only used outside the loop and can be moved
1597      out of the loop (e.g. invariants, inductions).  The loop can be
1598      optimized away by scalar optimizations.  We're better off not
1599      touching this loop.  */
1600   if (!need_to_vectorize)
1601     {
1602       if (dump_enabled_p ())
1603         dump_printf_loc (MSG_NOTE, vect_location,
1604                          "All the computation can be taken out of the loop.\n");
1605       return opt_result::failure_at
1606         (vect_location,
1607          "not vectorized: redundant loop. no profit to vectorize.\n");
1608     }
1609
1610   return opt_result::success ();
1611 }
1612
1613 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1614    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1615    definitely no, or -1 if it's worth retrying.  */
1616
1617 static int
1618 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1619 {
1620   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1621   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1622
1623   /* Only fully-masked loops can have iteration counts less than the
1624      vectorization factor.  */
1625   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1626     {
1627       HOST_WIDE_INT max_niter;
1628
1629       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1630         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1631       else
1632         max_niter = max_stmt_executions_int (loop);
1633
1634       if (max_niter != -1
1635           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1636         {
1637           if (dump_enabled_p ())
1638             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639                              "not vectorized: iteration count smaller than "
1640                              "vectorization factor.\n");
1641           return 0;
1642         }
1643     }
1644
1645   int min_profitable_iters, min_profitable_estimate;
1646   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1647                                       &min_profitable_estimate);
1648
1649   if (min_profitable_iters < 0)
1650     {
1651       if (dump_enabled_p ())
1652         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653                          "not vectorized: vectorization not profitable.\n");
1654       if (dump_enabled_p ())
1655         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656                          "not vectorized: vector version will never be "
1657                          "profitable.\n");
1658       return -1;
1659     }
1660
1661   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1662                                * assumed_vf);
1663
1664   /* Use the cost model only if it is more conservative than user specified
1665      threshold.  */
1666   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1667                                     min_profitable_iters);
1668
1669   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1670
1671   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1672       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1673     {
1674       if (dump_enabled_p ())
1675         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676                          "not vectorized: vectorization not profitable.\n");
1677       if (dump_enabled_p ())
1678         dump_printf_loc (MSG_NOTE, vect_location,
1679                          "not vectorized: iteration count smaller than user "
1680                          "specified loop bound parameter or minimum profitable "
1681                          "iterations (whichever is more conservative).\n");
1682       return 0;
1683     }
1684
1685   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1686   if (estimated_niter == -1)
1687     estimated_niter = likely_max_stmt_executions_int (loop);
1688   if (estimated_niter != -1
1689       && ((unsigned HOST_WIDE_INT) estimated_niter
1690           < MAX (th, (unsigned) min_profitable_estimate)))
1691     {
1692       if (dump_enabled_p ())
1693         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694                          "not vectorized: estimated iteration count too "
1695                          "small.\n");
1696       if (dump_enabled_p ())
1697         dump_printf_loc (MSG_NOTE, vect_location,
1698                          "not vectorized: estimated iteration count smaller "
1699                          "than specified loop bound parameter or minimum "
1700                          "profitable iterations (whichever is more "
1701                          "conservative).\n");
1702       return -1;
1703     }
1704
1705   return 1;
1706 }
1707
1708 static opt_result
1709 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1710                            vec<data_reference_p> *datarefs,
1711                            unsigned int *n_stmts)
1712 {
1713   *n_stmts = 0;
1714   for (unsigned i = 0; i < loop->num_nodes; i++)
1715     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1716          !gsi_end_p (gsi); gsi_next (&gsi))
1717       {
1718         gimple *stmt = gsi_stmt (gsi);
1719         if (is_gimple_debug (stmt))
1720           continue;
1721         ++(*n_stmts);
1722         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1723         if (!res)
1724           {
1725             if (is_gimple_call (stmt) && loop->safelen)
1726               {
1727                 tree fndecl = gimple_call_fndecl (stmt), op;
1728                 if (fndecl != NULL_TREE)
1729                   {
1730                     cgraph_node *node = cgraph_node::get (fndecl);
1731                     if (node != NULL && node->simd_clones != NULL)
1732                       {
1733                         unsigned int j, n = gimple_call_num_args (stmt);
1734                         for (j = 0; j < n; j++)
1735                           {
1736                             op = gimple_call_arg (stmt, j);
1737                             if (DECL_P (op)
1738                                 || (REFERENCE_CLASS_P (op)
1739                                     && get_base_address (op)))
1740                               break;
1741                           }
1742                         op = gimple_call_lhs (stmt);
1743                         /* Ignore #pragma omp declare simd functions
1744                            if they don't have data references in the
1745                            call stmt itself.  */
1746                         if (j == n
1747                             && !(op
1748                                  && (DECL_P (op)
1749                                      || (REFERENCE_CLASS_P (op)
1750                                          && get_base_address (op)))))
1751                           continue;
1752                       }
1753                   }
1754               }
1755             return res;
1756           }
1757         /* If dependence analysis will give up due to the limit on the
1758            number of datarefs stop here and fail fatally.  */
1759         if (datarefs->length ()
1760             > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1761           return opt_result::failure_at (stmt, "exceeded param "
1762                                          "loop-max-datarefs-for-datadeps\n");
1763       }
1764   return opt_result::success ();
1765 }
1766
1767 /* Look for SLP-only access groups and turn each individual access into its own
1768    group.  */
1769 static void
1770 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1771 {
1772   unsigned int i;
1773   struct data_reference *dr;
1774
1775   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1776
1777   vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1778   FOR_EACH_VEC_ELT (datarefs, i, dr)
1779     {
1780       gcc_assert (DR_REF (dr));
1781       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1782
1783       /* Check if the load is a part of an interleaving chain.  */
1784       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1785         {
1786           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1787           unsigned int group_size = DR_GROUP_SIZE (first_element);
1788
1789           /* Check if SLP-only groups.  */
1790           if (!STMT_SLP_TYPE (stmt_info)
1791               && STMT_VINFO_SLP_VECT_ONLY (first_element))
1792             {
1793               /* Dissolve the group.  */
1794               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1795
1796               stmt_vec_info vinfo = first_element;
1797               while (vinfo)
1798                 {
1799                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1800                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1801                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1802                   DR_GROUP_SIZE (vinfo) = 1;
1803                   DR_GROUP_GAP (vinfo) = group_size - 1;
1804                   vinfo = next;
1805                 }
1806             }
1807         }
1808     }
1809 }
1810
1811
1812 /* Decides whether we need to create an epilogue loop to handle
1813    remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
1814
1815 void
1816 determine_peel_for_niter (loop_vec_info loop_vinfo)
1817 {
1818   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1819
1820   unsigned HOST_WIDE_INT const_vf;
1821   HOST_WIDE_INT max_niter
1822     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1823
1824   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1825   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1826     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1827                                           (loop_vinfo));
1828
1829   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1830     /* The main loop handles all iterations.  */
1831     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1832   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1833            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1834     {
1835       /* Work out the (constant) number of iterations that need to be
1836          peeled for reasons other than niters.  */
1837       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1838       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1839         peel_niter += 1;
1840       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1841                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1842         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1843     }
1844   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1845            /* ??? When peeling for gaps but not alignment, we could
1846               try to check whether the (variable) niters is known to be
1847               VF * N + 1.  That's something of a niche case though.  */
1848            || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1849            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1850            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1851                 < (unsigned) exact_log2 (const_vf))
1852                /* In case of versioning, check if the maximum number of
1853                   iterations is greater than th.  If they are identical,
1854                   the epilogue is unnecessary.  */
1855                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1856                    || ((unsigned HOST_WIDE_INT) max_niter
1857                        > (th / const_vf) * const_vf))))
1858     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1859 }
1860
1861
1862 /* Function vect_analyze_loop_2.
1863
1864    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1865    for it.  The different analyses will record information in the
1866    loop_vec_info struct.  */
1867 static opt_result
1868 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1869 {
1870   opt_result ok = opt_result::success ();
1871   int res;
1872   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1873   poly_uint64 min_vf = 2;
1874
1875   /* The first group of checks is independent of the vector size.  */
1876   fatal = true;
1877
1878   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1879       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1880     return opt_result::failure_at (vect_location,
1881                                    "not vectorized: simd if(0)\n");
1882
1883   /* Find all data references in the loop (which correspond to vdefs/vuses)
1884      and analyze their evolution in the loop.  */
1885
1886   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1887
1888   /* Gather the data references and count stmts in the loop.  */
1889   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1890     {
1891       opt_result res
1892         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1893                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
1894                                      n_stmts);
1895       if (!res)
1896         {
1897           if (dump_enabled_p ())
1898             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899                              "not vectorized: loop contains function "
1900                              "calls or data references that cannot "
1901                              "be analyzed\n");
1902           return res;
1903         }
1904       loop_vinfo->shared->save_datarefs ();
1905     }
1906   else
1907     loop_vinfo->shared->check_datarefs ();
1908
1909   /* Analyze the data references and also adjust the minimal
1910      vectorization factor according to the loads and stores.  */
1911
1912   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1913   if (!ok)
1914     {
1915       if (dump_enabled_p ())
1916         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1917                          "bad data references.\n");
1918       return ok;
1919     }
1920
1921   /* Classify all cross-iteration scalar data-flow cycles.
1922      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1923   vect_analyze_scalar_cycles (loop_vinfo);
1924
1925   vect_pattern_recog (loop_vinfo);
1926
1927   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1928
1929   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1930      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1931
1932   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1933   if (!ok)
1934     {
1935       if (dump_enabled_p ())
1936         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937                          "bad data access.\n");
1938       return ok;
1939     }
1940
1941   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1942
1943   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1944   if (!ok)
1945     {
1946       if (dump_enabled_p ())
1947         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1948                          "unexpected pattern.\n");
1949       return ok;
1950     }
1951
1952   /* While the rest of the analysis below depends on it in some way.  */
1953   fatal = false;
1954
1955   /* Analyze data dependences between the data-refs in the loop
1956      and adjust the maximum vectorization factor according to
1957      the dependences.
1958      FORNOW: fail at the first data dependence that we encounter.  */
1959
1960   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1961   if (!ok)
1962     {
1963       if (dump_enabled_p ())
1964         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965                          "bad data dependence.\n");
1966       return ok;
1967     }
1968   if (max_vf != MAX_VECTORIZATION_FACTOR
1969       && maybe_lt (max_vf, min_vf))
1970     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1971   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1972
1973   ok = vect_determine_vectorization_factor (loop_vinfo);
1974   if (!ok)
1975     {
1976       if (dump_enabled_p ())
1977         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1978                          "can't determine vectorization factor.\n");
1979       return ok;
1980     }
1981   if (max_vf != MAX_VECTORIZATION_FACTOR
1982       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1983     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1984
1985   /* Compute the scalar iteration cost.  */
1986   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1987
1988   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989
1990   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1991   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1992   if (!ok)
1993     return ok;
1994
1995   /* If there are any SLP instances mark them as pure_slp.  */
1996   bool slp = vect_make_slp_decision (loop_vinfo);
1997   if (slp)
1998     {
1999       /* Find stmts that need to be both vectorized and SLPed.  */
2000       vect_detect_hybrid_slp (loop_vinfo);
2001
2002       /* Update the vectorization factor based on the SLP decision.  */
2003       vect_update_vf_for_slp (loop_vinfo);
2004     }
2005
2006   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2007
2008   /* We don't expect to have to roll back to anything other than an empty
2009      set of rgroups.  */
2010   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2011
2012   /* This is the point where we can re-start analysis with SLP forced off.  */
2013 start_over:
2014
2015   /* Now the vectorization factor is final.  */
2016   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2017   gcc_assert (known_ne (vectorization_factor, 0U));
2018
2019   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2020     {
2021       dump_printf_loc (MSG_NOTE, vect_location,
2022                        "vectorization_factor = ");
2023       dump_dec (MSG_NOTE, vectorization_factor);
2024       dump_printf (MSG_NOTE, ", niters = %wd\n",
2025                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2026     }
2027
2028   /* Analyze the alignment of the data-refs in the loop.
2029      Fail if a data reference is found that cannot be vectorized.  */
2030
2031   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2032   if (!ok)
2033     {
2034       if (dump_enabled_p ())
2035         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036                          "bad data alignment.\n");
2037       return ok;
2038     }
2039
2040   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2041      It is important to call pruning after vect_analyze_data_ref_accesses,
2042      since we use grouping information gathered by interleaving analysis.  */
2043   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2044   if (!ok)
2045     return ok;
2046
2047   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2048      vectorization, since we do not want to add extra peeling or
2049      add versioning for alignment.  */
2050   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2051     /* This pass will decide on using loop versioning and/or loop peeling in
2052        order to enhance the alignment of data references in the loop.  */
2053     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2054   else
2055     ok = vect_verify_datarefs_alignment (loop_vinfo);
2056   if (!ok)
2057     return ok;
2058
2059   if (slp)
2060     {
2061       /* Analyze operations in the SLP instances.  Note this may
2062          remove unsupported SLP instances which makes the above
2063          SLP kind detection invalid.  */
2064       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2065       vect_slp_analyze_operations (loop_vinfo);
2066       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2067         {
2068           ok = opt_result::failure_at (vect_location,
2069                                        "unsupported SLP instances\n");
2070           goto again;
2071         }
2072     }
2073
2074   /* Dissolve SLP-only groups.  */
2075   vect_dissolve_slp_only_groups (loop_vinfo);
2076
2077   /* Scan all the remaining operations in the loop that are not subject
2078      to SLP and make sure they are vectorizable.  */
2079   ok = vect_analyze_loop_operations (loop_vinfo);
2080   if (!ok)
2081     {
2082       if (dump_enabled_p ())
2083         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084                          "bad operation or unsupported loop bound.\n");
2085       return ok;
2086     }
2087
2088   /* Decide whether to use a fully-masked loop for this vectorization
2089      factor.  */
2090   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2091     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2092        && vect_verify_full_masking (loop_vinfo));
2093   if (dump_enabled_p ())
2094     {
2095       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2096         dump_printf_loc (MSG_NOTE, vect_location,
2097                          "using a fully-masked loop.\n");
2098       else
2099         dump_printf_loc (MSG_NOTE, vect_location,
2100                          "not using a fully-masked loop.\n");
2101     }
2102
2103   /* If epilog loop is required because of data accesses with gaps,
2104      one additional iteration needs to be peeled.  Check if there is
2105      enough iterations for vectorization.  */
2106   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2107       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2108       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2109     {
2110       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2111       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2112
2113       if (known_lt (wi::to_widest (scalar_niters), vf))
2114         return opt_result::failure_at (vect_location,
2115                                        "loop has no enough iterations to"
2116                                        " support peeling for gaps.\n");
2117     }
2118
2119   /* Check the costings of the loop make vectorizing worthwhile.  */
2120   res = vect_analyze_loop_costing (loop_vinfo);
2121   if (res < 0)
2122     {
2123       ok = opt_result::failure_at (vect_location,
2124                                    "Loop costings may not be worthwhile.\n");
2125       goto again;
2126     }
2127   if (!res)
2128     return opt_result::failure_at (vect_location,
2129                                    "Loop costings not worthwhile.\n");
2130
2131   determine_peel_for_niter (loop_vinfo);
2132   /* If an epilogue loop is required make sure we can create one.  */
2133   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2135     {
2136       if (dump_enabled_p ())
2137         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2138       if (!vect_can_advance_ivs_p (loop_vinfo)
2139           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2140                                            single_exit (LOOP_VINFO_LOOP
2141                                                          (loop_vinfo))))
2142         {
2143           ok = opt_result::failure_at (vect_location,
2144                                        "not vectorized: can't create required "
2145                                        "epilog loop\n");
2146           goto again;
2147         }
2148     }
2149
2150   /* During peeling, we need to check if number of loop iterations is
2151      enough for both peeled prolog loop and vector loop.  This check
2152      can be merged along with threshold check of loop versioning, so
2153      increase threshold for this case if necessary.  */
2154   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2155     {
2156       poly_uint64 niters_th = 0;
2157       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2158
2159       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2160         {
2161           /* Niters for peeled prolog loop.  */
2162           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2163             {
2164               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2165               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2166               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2167             }
2168           else
2169             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2170         }
2171
2172       /* Niters for at least one iteration of vectorized loop.  */
2173       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2174         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2175       /* One additional iteration because of peeling for gap.  */
2176       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2177         niters_th += 1;
2178
2179       /*  Use the same condition as vect_transform_loop to decide when to use
2180           the cost to determine a versioning threshold.  */
2181       if (th >= vect_vf_for_cost (loop_vinfo)
2182           && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2183           && ordered_p (th, niters_th))
2184         niters_th = ordered_max (poly_uint64 (th), niters_th);
2185
2186       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2187     }
2188
2189   gcc_assert (known_eq (vectorization_factor,
2190                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2191
2192   /* Ok to vectorize!  */
2193   return opt_result::success ();
2194
2195 again:
2196   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2197   gcc_assert (!ok);
2198
2199   /* Try again with SLP forced off but if we didn't do any SLP there is
2200      no point in re-trying.  */
2201   if (!slp)
2202     return ok;
2203
2204   /* If there are reduction chains re-trying will fail anyway.  */
2205   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2206     return ok;
2207
2208   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2209      via interleaving or lane instructions.  */
2210   slp_instance instance;
2211   slp_tree node;
2212   unsigned i, j;
2213   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2214     {
2215       stmt_vec_info vinfo;
2216       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2217       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2218         continue;
2219       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2220       unsigned int size = DR_GROUP_SIZE (vinfo);
2221       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2222       if (! vect_store_lanes_supported (vectype, size, false)
2223          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2224          && ! vect_grouped_store_supported (vectype, size))
2225         return opt_result::failure_at (vinfo->stmt,
2226                                        "unsupported grouped store\n");
2227       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2228         {
2229           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2230           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2231           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2232           size = DR_GROUP_SIZE (vinfo);
2233           vectype = STMT_VINFO_VECTYPE (vinfo);
2234           if (! vect_load_lanes_supported (vectype, size, false)
2235               && ! vect_grouped_load_supported (vectype, single_element_p,
2236                                                 size))
2237             return opt_result::failure_at (vinfo->stmt,
2238                                            "unsupported grouped load\n");
2239         }
2240     }
2241
2242   if (dump_enabled_p ())
2243     dump_printf_loc (MSG_NOTE, vect_location,
2244                      "re-trying with SLP disabled\n");
2245
2246   /* Roll back state appropriately.  No SLP this time.  */
2247   slp = false;
2248   /* Restore vectorization factor as it were without SLP.  */
2249   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2250   /* Free the SLP instances.  */
2251   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2252     vect_free_slp_instance (instance, false);
2253   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2254   /* Reset SLP type to loop_vect on all stmts.  */
2255   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2256     {
2257       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2258       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2259            !gsi_end_p (si); gsi_next (&si))
2260         {
2261           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2262           STMT_SLP_TYPE (stmt_info) = loop_vect;
2263           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2264               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2265             {
2266               /* vectorizable_reduction adjusts reduction stmt def-types,
2267                  restore them to that of the PHI.  */
2268               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2269                 = STMT_VINFO_DEF_TYPE (stmt_info);
2270               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2271                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2272                 = STMT_VINFO_DEF_TYPE (stmt_info);
2273             }
2274         }
2275       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2276            !gsi_end_p (si); gsi_next (&si))
2277         {
2278           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2279           STMT_SLP_TYPE (stmt_info) = loop_vect;
2280           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2281             {
2282               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2283               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2284               STMT_SLP_TYPE (stmt_info) = loop_vect;
2285               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2286                    !gsi_end_p (pi); gsi_next (&pi))
2287                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2288                   = loop_vect;
2289             }
2290         }
2291     }
2292   /* Free optimized alias test DDRS.  */
2293   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2294   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2295   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2296   /* Reset target cost data.  */
2297   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2298   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2299     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2300   /* Reset accumulated rgroup information.  */
2301   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2302   /* Reset assorted flags.  */
2303   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2304   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2305   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2306   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2307   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2308
2309   goto start_over;
2310 }
2311
2312 /* Function vect_analyze_loop.
2313
2314    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2315    for it.  The different analyses will record information in the
2316    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2317    be vectorized.  */
2318 opt_loop_vec_info
2319 vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
2320                    vec_info_shared *shared)
2321 {
2322   auto_vector_sizes vector_sizes;
2323
2324   /* Autodetect first vector size we try.  */
2325   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2326                                                 loop->simdlen != 0);
2327   unsigned int next_size = 0;
2328
2329   DUMP_VECT_SCOPE ("analyze_loop_nest");
2330
2331   if (loop_outer (loop)
2332       && loop_vec_info_for_loop (loop_outer (loop))
2333       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2334     return opt_loop_vec_info::failure_at (vect_location,
2335                                           "outer-loop already vectorized.\n");
2336
2337   if (!find_loop_nest (loop, &shared->loop_nest))
2338     return opt_loop_vec_info::failure_at
2339       (vect_location,
2340        "not vectorized: loop nest containing two or more consecutive inner"
2341        " loops cannot be vectorized\n");
2342
2343   unsigned n_stmts = 0;
2344   poly_uint64 autodetected_vector_size = 0;
2345   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2346   poly_uint64 next_vector_size = 0;
2347   while (1)
2348     {
2349       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2350       opt_loop_vec_info loop_vinfo
2351         = vect_analyze_loop_form (loop, shared);
2352       if (!loop_vinfo)
2353         {
2354           if (dump_enabled_p ())
2355             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2356                              "bad loop form.\n");
2357           gcc_checking_assert (first_loop_vinfo == NULL);
2358           return loop_vinfo;
2359         }
2360       loop_vinfo->vector_size = next_vector_size;
2361
2362       bool fatal = false;
2363
2364       if (orig_loop_vinfo)
2365         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2366
2367       opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2368       if (next_size == 0)
2369         autodetected_vector_size = loop_vinfo->vector_size;
2370
2371       if (res)
2372         {
2373           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2374
2375           if (loop->simdlen
2376               && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2377                            (unsigned HOST_WIDE_INT) loop->simdlen))
2378             {
2379               if (first_loop_vinfo == NULL)
2380                 {
2381                   first_loop_vinfo = loop_vinfo;
2382                   loop->aux = NULL;
2383                 }
2384               else
2385                 delete loop_vinfo;
2386             }
2387           else
2388             {
2389               delete first_loop_vinfo;
2390               return loop_vinfo;
2391             }
2392         }
2393       else
2394         delete loop_vinfo;
2395
2396       if (fatal)
2397         {
2398           gcc_checking_assert (first_loop_vinfo == NULL);
2399           return opt_loop_vec_info::propagate_failure (res);
2400         }
2401
2402       if (next_size < vector_sizes.length ()
2403           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2404         next_size += 1;
2405
2406       if (next_size == vector_sizes.length ()
2407           || known_eq (autodetected_vector_size, 0U))
2408         {
2409           if (first_loop_vinfo)
2410             {
2411               loop->aux = (loop_vec_info) first_loop_vinfo;
2412               if (dump_enabled_p ())
2413                 {
2414                   dump_printf_loc (MSG_NOTE, vect_location,
2415                                    "***** Choosing vector size ");
2416                   dump_dec (MSG_NOTE, first_loop_vinfo->vector_size);
2417                   dump_printf (MSG_NOTE, "\n");
2418                 }
2419               return first_loop_vinfo;
2420             }
2421           else
2422             return opt_loop_vec_info::propagate_failure (res);
2423         }
2424
2425       /* Try the next biggest vector size.  */
2426       next_vector_size = vector_sizes[next_size++];
2427       if (dump_enabled_p ())
2428         {
2429           dump_printf_loc (MSG_NOTE, vect_location,
2430                            "***** Re-trying analysis with "
2431                            "vector size ");
2432           dump_dec (MSG_NOTE, next_vector_size);
2433           dump_printf (MSG_NOTE, "\n");
2434         }
2435     }
2436 }
2437
2438 /* Return true if there is an in-order reduction function for CODE, storing
2439    it in *REDUC_FN if so.  */
2440
2441 static bool
2442 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2443 {
2444   switch (code)
2445     {
2446     case PLUS_EXPR:
2447       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2448       return true;
2449
2450     default:
2451       return false;
2452     }
2453 }
2454
2455 /* Function reduction_fn_for_scalar_code
2456
2457    Input:
2458    CODE - tree_code of a reduction operations.
2459
2460    Output:
2461    REDUC_FN - the corresponding internal function to be used to reduce the
2462       vector of partial results into a single scalar result, or IFN_LAST
2463       if the operation is a supported reduction operation, but does not have
2464       such an internal function.
2465
2466    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2467
2468 static bool
2469 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2470 {
2471   switch (code)
2472     {
2473       case MAX_EXPR:
2474         *reduc_fn = IFN_REDUC_MAX;
2475         return true;
2476
2477       case MIN_EXPR:
2478         *reduc_fn = IFN_REDUC_MIN;
2479         return true;
2480
2481       case PLUS_EXPR:
2482         *reduc_fn = IFN_REDUC_PLUS;
2483         return true;
2484
2485       case BIT_AND_EXPR:
2486         *reduc_fn = IFN_REDUC_AND;
2487         return true;
2488
2489       case BIT_IOR_EXPR:
2490         *reduc_fn = IFN_REDUC_IOR;
2491         return true;
2492
2493       case BIT_XOR_EXPR:
2494         *reduc_fn = IFN_REDUC_XOR;
2495         return true;
2496
2497       case MULT_EXPR:
2498       case MINUS_EXPR:
2499         *reduc_fn = IFN_LAST;
2500         return true;
2501
2502       default:
2503        return false;
2504     }
2505 }
2506
2507 /* If there is a neutral value X such that SLP reduction NODE would not
2508    be affected by the introduction of additional X elements, return that X,
2509    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2510    is true if the SLP statements perform a single reduction, false if each
2511    statement performs an independent reduction.  */
2512
2513 static tree
2514 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2515                               bool reduc_chain)
2516 {
2517   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2518   stmt_vec_info stmt_vinfo = stmts[0];
2519   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2520   tree scalar_type = TREE_TYPE (vector_type);
2521   class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2522   gcc_assert (loop);
2523
2524   switch (code)
2525     {
2526     case WIDEN_SUM_EXPR:
2527     case DOT_PROD_EXPR:
2528     case SAD_EXPR:
2529     case PLUS_EXPR:
2530     case MINUS_EXPR:
2531     case BIT_IOR_EXPR:
2532     case BIT_XOR_EXPR:
2533       return build_zero_cst (scalar_type);
2534
2535     case MULT_EXPR:
2536       return build_one_cst (scalar_type);
2537
2538     case BIT_AND_EXPR:
2539       return build_all_ones_cst (scalar_type);
2540
2541     case MAX_EXPR:
2542     case MIN_EXPR:
2543       /* For MIN/MAX the initial values are neutral.  A reduction chain
2544          has only a single initial value, so that value is neutral for
2545          all statements.  */
2546       if (reduc_chain)
2547         return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2548                                       loop_preheader_edge (loop));
2549       return NULL_TREE;
2550
2551     default:
2552       return NULL_TREE;
2553     }
2554 }
2555
2556 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2557    STMT is printed with a message MSG. */
2558
2559 static void
2560 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2561 {
2562   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2563 }
2564
2565 /* Return true if we need an in-order reduction for operation CODE
2566    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2567    overflow must wrap.  */
2568
2569 bool
2570 needs_fold_left_reduction_p (tree type, tree_code code)
2571 {
2572   /* CHECKME: check for !flag_finite_math_only too?  */
2573   if (SCALAR_FLOAT_TYPE_P (type))
2574     switch (code)
2575       {
2576       case MIN_EXPR:
2577       case MAX_EXPR:
2578         return false;
2579
2580       default:
2581         return !flag_associative_math;
2582       }
2583
2584   if (INTEGRAL_TYPE_P (type))
2585     {
2586       if (!operation_no_trapping_overflow (type, code))
2587         return true;
2588       return false;
2589     }
2590
2591   if (SAT_FIXED_POINT_TYPE_P (type))
2592     return true;
2593
2594   return false;
2595 }
2596
2597 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2598    has a handled computation expression.  Store the main reduction
2599    operation in *CODE.  */
2600
2601 static bool
2602 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2603                       tree loop_arg, enum tree_code *code,
2604                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2605 {
2606   auto_bitmap visited;
2607   tree lookfor = PHI_RESULT (phi);
2608   ssa_op_iter curri;
2609   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2610   while (USE_FROM_PTR (curr) != loop_arg)
2611     curr = op_iter_next_use (&curri);
2612   curri.i = curri.numops;
2613   do
2614     {
2615       path.safe_push (std::make_pair (curri, curr));
2616       tree use = USE_FROM_PTR (curr);
2617       if (use == lookfor)
2618         break;
2619       gimple *def = SSA_NAME_DEF_STMT (use);
2620       if (gimple_nop_p (def)
2621           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2622         {
2623 pop:
2624           do
2625             {
2626               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2627               curri = x.first;
2628               curr = x.second;
2629               do
2630                 curr = op_iter_next_use (&curri);
2631               /* Skip already visited or non-SSA operands (from iterating
2632                  over PHI args).  */
2633               while (curr != NULL_USE_OPERAND_P
2634                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2635                          || ! bitmap_set_bit (visited,
2636                                               SSA_NAME_VERSION
2637                                                 (USE_FROM_PTR (curr)))));
2638             }
2639           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2640           if (curr == NULL_USE_OPERAND_P)
2641             break;
2642         }
2643       else
2644         {
2645           if (gimple_code (def) == GIMPLE_PHI)
2646             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2647           else
2648             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2649           while (curr != NULL_USE_OPERAND_P
2650                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2651                      || ! bitmap_set_bit (visited,
2652                                           SSA_NAME_VERSION
2653                                             (USE_FROM_PTR (curr)))))
2654             curr = op_iter_next_use (&curri);
2655           if (curr == NULL_USE_OPERAND_P)
2656             goto pop;
2657         }
2658     }
2659   while (1);
2660   if (dump_file && (dump_flags & TDF_DETAILS))
2661     {
2662       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2663       unsigned i;
2664       std::pair<ssa_op_iter, use_operand_p> *x;
2665       FOR_EACH_VEC_ELT (path, i, x)
2666         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2667       dump_printf (MSG_NOTE, "\n");
2668     }
2669
2670   /* Check whether the reduction path detected is valid.  */
2671   bool fail = path.length () == 0;
2672   bool neg = false;
2673   *code = ERROR_MARK;
2674   for (unsigned i = 1; i < path.length (); ++i)
2675     {
2676       gimple *use_stmt = USE_STMT (path[i].second);
2677       tree op = USE_FROM_PTR (path[i].second);
2678       if (! has_single_use (op)
2679           || ! is_gimple_assign (use_stmt)
2680           /* The following make sure we can compute the operand index
2681              easily plus it mostly disallows chaining via COND_EXPR condition
2682              operands.  */
2683           || (gimple_assign_rhs1 (use_stmt) != op
2684               && gimple_assign_rhs2 (use_stmt) != op
2685               && gimple_assign_rhs3 (use_stmt) != op))
2686         {
2687           fail = true;
2688           break;
2689         }
2690       enum tree_code use_code = gimple_assign_rhs_code (use_stmt);
2691       if (use_code == MINUS_EXPR)
2692         {
2693           use_code = PLUS_EXPR;
2694           /* Track whether we negate the reduction value each iteration.  */
2695           if (gimple_assign_rhs2 (use_stmt) == op)
2696             neg = ! neg;
2697         }
2698       if (CONVERT_EXPR_CODE_P (use_code)
2699           && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
2700                                     TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
2701         ;
2702       else if (*code == ERROR_MARK)
2703         *code = use_code;
2704       else if (use_code != *code)
2705         {
2706           fail = true;
2707           break;
2708         }
2709     }
2710   return ! fail && ! neg && *code != ERROR_MARK;
2711 }
2712
2713 bool
2714 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2715                       tree loop_arg, enum tree_code code)
2716 {
2717   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2718   enum tree_code code_;
2719   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
2720           && code_ == code);
2721 }
2722
2723
2724
2725 /* Function vect_is_simple_reduction
2726
2727    (1) Detect a cross-iteration def-use cycle that represents a simple
2728    reduction computation.  We look for the following pattern:
2729
2730    loop_header:
2731      a1 = phi < a0, a2 >
2732      a3 = ...
2733      a2 = operation (a3, a1)
2734
2735    or
2736
2737    a3 = ...
2738    loop_header:
2739      a1 = phi < a0, a2 >
2740      a2 = operation (a3, a1)
2741
2742    such that:
2743    1. operation is commutative and associative and it is safe to
2744       change the order of the computation
2745    2. no uses for a2 in the loop (a2 is used out of the loop)
2746    3. no uses of a1 in the loop besides the reduction operation
2747    4. no uses of a1 outside the loop.
2748
2749    Conditions 1,4 are tested here.
2750    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2751
2752    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2753    nested cycles.
2754
2755    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2756    reductions:
2757
2758      a1 = phi < a0, a2 >
2759      inner loop (def of a3)
2760      a2 = phi < a3 >
2761
2762    (4) Detect condition expressions, ie:
2763      for (int i = 0; i < N; i++)
2764        if (a[i] < val)
2765         ret_val = a[i];
2766
2767 */
2768
2769 static stmt_vec_info
2770 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2771                           bool *double_reduc)
2772 {
2773   gphi *phi = as_a <gphi *> (phi_info->stmt);
2774   gimple *phi_use_stmt = NULL;
2775   imm_use_iterator imm_iter;
2776   use_operand_p use_p;
2777
2778   *double_reduc = false;
2779   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
2780
2781   tree phi_name = PHI_RESULT (phi);
2782   /* ???  If there are no uses of the PHI result the inner loop reduction
2783      won't be detected as possibly double-reduction by vectorizable_reduction
2784      because that tries to walk the PHI arg from the preheader edge which
2785      can be constant.  See PR60382.  */
2786   if (has_zero_uses (phi_name))
2787     return NULL;
2788   class loop *loop = (gimple_bb (phi))->loop_father;
2789   unsigned nphi_def_loop_uses = 0;
2790   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2791     {
2792       gimple *use_stmt = USE_STMT (use_p);
2793       if (is_gimple_debug (use_stmt))
2794         continue;
2795
2796       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2797         {
2798           if (dump_enabled_p ())
2799             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800                              "intermediate value used outside loop.\n");
2801
2802           return NULL;
2803         }
2804
2805       nphi_def_loop_uses++;
2806       phi_use_stmt = use_stmt;
2807     }
2808
2809   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2810   if (TREE_CODE (latch_def) != SSA_NAME)
2811     {
2812       if (dump_enabled_p ())
2813         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2814                          "reduction: not ssa_name: %T\n", latch_def);
2815       return NULL;
2816     }
2817
2818   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
2819   if (!def_stmt_info
2820       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2821     return NULL;
2822
2823   bool nested_in_vect_loop
2824     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
2825   unsigned nlatch_def_loop_uses = 0;
2826   auto_vec<gphi *, 3> lcphis;
2827   bool inner_loop_of_double_reduc = false;
2828   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
2829     {
2830       gimple *use_stmt = USE_STMT (use_p);
2831       if (is_gimple_debug (use_stmt))
2832         continue;
2833       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2834         nlatch_def_loop_uses++;
2835       else
2836         {
2837           /* We can have more than one loop-closed PHI.  */
2838           lcphis.safe_push (as_a <gphi *> (use_stmt));
2839           if (nested_in_vect_loop
2840               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2841                   == vect_double_reduction_def))
2842             inner_loop_of_double_reduc = true;
2843         }
2844     }
2845
2846   /* If we are vectorizing an inner reduction we are executing that
2847      in the original order only in case we are not dealing with a
2848      double reduction.  */
2849   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
2850     {
2851       if (dump_enabled_p ())
2852         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
2853                         "detected nested cycle: ");
2854       return def_stmt_info;
2855     }
2856
2857   /* If this isn't a nested cycle or if the nested cycle reduction value
2858      is used ouside of the inner loop we cannot handle uses of the reduction
2859      value.  */
2860   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
2861     {
2862       if (dump_enabled_p ())
2863         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2864                          "reduction used in loop.\n");
2865       return NULL;
2866     }
2867
2868   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2869      defined in the inner loop.  */
2870   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2871     {
2872       tree op1 = PHI_ARG_DEF (def_stmt, 0);
2873       if (gimple_phi_num_args (def_stmt) != 1
2874           || TREE_CODE (op1) != SSA_NAME)
2875         {
2876           if (dump_enabled_p ())
2877             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878                              "unsupported phi node definition.\n");
2879
2880           return NULL;
2881         }
2882
2883       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2884       if (gimple_bb (def1)
2885           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2886           && loop->inner
2887           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2888           && is_gimple_assign (def1)
2889           && is_a <gphi *> (phi_use_stmt)
2890           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2891         {
2892           if (dump_enabled_p ())
2893             report_vect_op (MSG_NOTE, def_stmt,
2894                             "detected double reduction: ");
2895
2896           *double_reduc = true;
2897           return def_stmt_info;
2898         }
2899
2900       return NULL;
2901     }
2902
2903   /* Look for the expression computing latch_def from then loop PHI result.  */
2904   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2905   enum tree_code code;
2906   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
2907                             path))
2908     {
2909       STMT_VINFO_REDUC_CODE (phi_info) = code;
2910       if (code == COND_EXPR && !nested_in_vect_loop)
2911         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
2912
2913       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
2914          reduction chain for which the additional restriction is that
2915          all operations in the chain are the same.  */
2916       auto_vec<stmt_vec_info, 8> reduc_chain;
2917       unsigned i;
2918       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
2919       for (i = path.length () - 1; i >= 1; --i)
2920         {
2921           gimple *stmt = USE_STMT (path[i].second);
2922           if (gimple_assign_rhs_code (stmt) != code)
2923             is_slp_reduc = false;
2924           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
2925           STMT_VINFO_REDUC_IDX (stmt_info)
2926             = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
2927           reduc_chain.safe_push (stmt_info);
2928         }
2929       if (is_slp_reduc && reduc_chain.length () > 1)
2930         {
2931           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2932             {
2933               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2934               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2935             }
2936           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2937           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2938
2939           /* Save the chain for further analysis in SLP detection.  */
2940           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2941           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
2942
2943           if (dump_enabled_p ())
2944             dump_printf_loc (MSG_NOTE, vect_location,
2945                             "reduction: detected reduction chain\n");
2946         }
2947       else if (dump_enabled_p ())
2948         dump_printf_loc (MSG_NOTE, vect_location,
2949                          "reduction: detected reduction\n");
2950
2951       return def_stmt_info;
2952     }
2953
2954   if (dump_enabled_p ())
2955     dump_printf_loc (MSG_NOTE, vect_location,
2956                      "reduction: unknown pattern\n");
2957
2958   return NULL;
2959 }
2960
2961 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2962 int
2963 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2964                              int *peel_iters_epilogue,
2965                              stmt_vector_for_cost *scalar_cost_vec,
2966                              stmt_vector_for_cost *prologue_cost_vec,
2967                              stmt_vector_for_cost *epilogue_cost_vec)
2968 {
2969   int retval = 0;
2970   int assumed_vf = vect_vf_for_cost (loop_vinfo);
2971
2972   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2973     {
2974       *peel_iters_epilogue = assumed_vf / 2;
2975       if (dump_enabled_p ())
2976         dump_printf_loc (MSG_NOTE, vect_location,
2977                          "cost model: epilogue peel iters set to vf/2 "
2978                          "because loop iterations are unknown .\n");
2979
2980       /* If peeled iterations are known but number of scalar loop
2981          iterations are unknown, count a taken branch per peeled loop.  */
2982       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2983                                  NULL, 0, vect_prologue);
2984       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
2985                                   NULL, 0, vect_epilogue);
2986     }
2987   else
2988     {
2989       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2990       peel_iters_prologue = niters < peel_iters_prologue ?
2991                             niters : peel_iters_prologue;
2992       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
2993       /* If we need to peel for gaps, but no peeling is required, we have to
2994          peel VF iterations.  */
2995       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2996         *peel_iters_epilogue = assumed_vf;
2997     }
2998
2999   stmt_info_for_cost *si;
3000   int j;
3001   if (peel_iters_prologue)
3002     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3003       retval += record_stmt_cost (prologue_cost_vec,
3004                                   si->count * peel_iters_prologue,
3005                                   si->kind, si->stmt_info, si->misalign,
3006                                   vect_prologue);
3007   if (*peel_iters_epilogue)
3008     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3009       retval += record_stmt_cost (epilogue_cost_vec,
3010                                   si->count * *peel_iters_epilogue,
3011                                   si->kind, si->stmt_info, si->misalign,
3012                                   vect_epilogue);
3013
3014   return retval;
3015 }
3016
3017 /* Function vect_estimate_min_profitable_iters
3018
3019    Return the number of iterations required for the vector version of the
3020    loop to be profitable relative to the cost of the scalar version of the
3021    loop.
3022
3023    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3024    of iterations for vectorization.  -1 value means loop vectorization
3025    is not profitable.  This returned value may be used for dynamic
3026    profitability check.
3027
3028    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3029    for static check against estimated number of iterations.  */
3030
3031 static void
3032 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3033                                     int *ret_min_profitable_niters,
3034                                     int *ret_min_profitable_estimate)
3035 {
3036   int min_profitable_iters;
3037   int min_profitable_estimate;
3038   int peel_iters_prologue;
3039   int peel_iters_epilogue;
3040   unsigned vec_inside_cost = 0;
3041   int vec_outside_cost = 0;
3042   unsigned vec_prologue_cost = 0;
3043   unsigned vec_epilogue_cost = 0;
3044   int scalar_single_iter_cost = 0;
3045   int scalar_outside_cost = 0;
3046   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3047   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3048   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3049
3050   /* Cost model disabled.  */
3051   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3052     {
3053       if (dump_enabled_p ())
3054         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3055       *ret_min_profitable_niters = 0;
3056       *ret_min_profitable_estimate = 0;
3057       return;
3058     }
3059
3060   /* Requires loop versioning tests to handle misalignment.  */
3061   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3062     {
3063       /*  FIXME: Make cost depend on complexity of individual check.  */
3064       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3065       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3066                             vect_prologue);
3067       if (dump_enabled_p ())
3068         dump_printf (MSG_NOTE,
3069                      "cost model: Adding cost of checks for loop "
3070                      "versioning to treat misalignment.\n");
3071     }
3072
3073   /* Requires loop versioning with alias checks.  */
3074   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3075     {
3076       /*  FIXME: Make cost depend on complexity of individual check.  */
3077       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3078       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3079                             vect_prologue);
3080       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3081       if (len)
3082         /* Count LEN - 1 ANDs and LEN comparisons.  */
3083         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3084                               NULL, 0, vect_prologue);
3085       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3086       if (len)
3087         {
3088           /* Count LEN - 1 ANDs and LEN comparisons.  */
3089           unsigned int nstmts = len * 2 - 1;
3090           /* +1 for each bias that needs adding.  */
3091           for (unsigned int i = 0; i < len; ++i)
3092             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3093               nstmts += 1;
3094           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3095                                 NULL, 0, vect_prologue);
3096         }
3097       if (dump_enabled_p ())
3098         dump_printf (MSG_NOTE,
3099                      "cost model: Adding cost of checks for loop "
3100                      "versioning aliasing.\n");
3101     }
3102
3103   /* Requires loop versioning with niter checks.  */
3104   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3105     {
3106       /*  FIXME: Make cost depend on complexity of individual check.  */
3107       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3108                             vect_prologue);
3109       if (dump_enabled_p ())
3110         dump_printf (MSG_NOTE,
3111                      "cost model: Adding cost of checks for loop "
3112                      "versioning niters.\n");
3113     }
3114
3115   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3116     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3117                           vect_prologue);
3118
3119   /* Count statements in scalar loop.  Using this as scalar cost for a single
3120      iteration for now.
3121
3122      TODO: Add outer loop support.
3123
3124      TODO: Consider assigning different costs to different scalar
3125      statements.  */
3126
3127   scalar_single_iter_cost
3128     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3129
3130   /* Add additional cost for the peeled instructions in prologue and epilogue
3131      loop.  (For fully-masked loops there will be no peeling.)
3132
3133      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3134      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3135
3136      TODO: Build an expression that represents peel_iters for prologue and
3137      epilogue to be used in a run-time test.  */
3138
3139   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3140     {
3141       peel_iters_prologue = 0;
3142       peel_iters_epilogue = 0;
3143
3144       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3145         {
3146           /* We need to peel exactly one iteration.  */
3147           peel_iters_epilogue += 1;
3148           stmt_info_for_cost *si;
3149           int j;
3150           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3151                             j, si)
3152             (void) add_stmt_cost (target_cost_data, si->count,
3153                                   si->kind, si->stmt_info, si->misalign,
3154                                   vect_epilogue);
3155         }
3156     }
3157   else if (npeel < 0)
3158     {
3159       peel_iters_prologue = assumed_vf / 2;
3160       if (dump_enabled_p ())
3161         dump_printf (MSG_NOTE, "cost model: "
3162                      "prologue peel iters set to vf/2.\n");
3163
3164       /* If peeling for alignment is unknown, loop bound of main loop becomes
3165          unknown.  */
3166       peel_iters_epilogue = assumed_vf / 2;
3167       if (dump_enabled_p ())
3168         dump_printf (MSG_NOTE, "cost model: "
3169                      "epilogue peel iters set to vf/2 because "
3170                      "peeling for alignment is unknown.\n");
3171
3172       /* If peeled iterations are unknown, count a taken branch and a not taken
3173          branch per peeled loop. Even if scalar loop iterations are known,
3174          vector iterations are not known since peeled prologue iterations are
3175          not known. Hence guards remain the same.  */
3176       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3177                             NULL, 0, vect_prologue);
3178       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3179                             NULL, 0, vect_prologue);
3180       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3181                             NULL, 0, vect_epilogue);
3182       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3183                             NULL, 0, vect_epilogue);
3184       stmt_info_for_cost *si;
3185       int j;
3186       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3187         {
3188           (void) add_stmt_cost (target_cost_data,
3189                                 si->count * peel_iters_prologue,
3190                                 si->kind, si->stmt_info, si->misalign,
3191                                 vect_prologue);
3192           (void) add_stmt_cost (target_cost_data,
3193                                 si->count * peel_iters_epilogue,
3194                                 si->kind, si->stmt_info, si->misalign,
3195                                 vect_epilogue);
3196         }
3197     }
3198   else
3199     {
3200       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3201       stmt_info_for_cost *si;
3202       int j;
3203       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3204
3205       prologue_cost_vec.create (2);
3206       epilogue_cost_vec.create (2);
3207       peel_iters_prologue = npeel;
3208
3209       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3210                                           &peel_iters_epilogue,
3211                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3212                                             (loop_vinfo),
3213                                           &prologue_cost_vec,
3214                                           &epilogue_cost_vec);
3215
3216       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3217         (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3218                               si->misalign, vect_prologue);
3219
3220       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3221         (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3222                               si->misalign, vect_epilogue);
3223
3224       prologue_cost_vec.release ();
3225       epilogue_cost_vec.release ();
3226     }
3227
3228   /* FORNOW: The scalar outside cost is incremented in one of the
3229      following ways:
3230
3231      1. The vectorizer checks for alignment and aliasing and generates
3232      a condition that allows dynamic vectorization.  A cost model
3233      check is ANDED with the versioning condition.  Hence scalar code
3234      path now has the added cost of the versioning check.
3235
3236        if (cost > th & versioning_check)
3237          jmp to vector code
3238
3239      Hence run-time scalar is incremented by not-taken branch cost.
3240
3241      2. The vectorizer then checks if a prologue is required.  If the
3242      cost model check was not done before during versioning, it has to
3243      be done before the prologue check.
3244
3245        if (cost <= th)
3246          prologue = scalar_iters
3247        if (prologue == 0)
3248          jmp to vector code
3249        else
3250          execute prologue
3251        if (prologue == num_iters)
3252          go to exit
3253
3254      Hence the run-time scalar cost is incremented by a taken branch,
3255      plus a not-taken branch, plus a taken branch cost.
3256
3257      3. The vectorizer then checks if an epilogue is required.  If the
3258      cost model check was not done before during prologue check, it
3259      has to be done with the epilogue check.
3260
3261        if (prologue == 0)
3262          jmp to vector code
3263        else
3264          execute prologue
3265        if (prologue == num_iters)
3266          go to exit
3267        vector code:
3268          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3269            jmp to epilogue
3270
3271      Hence the run-time scalar cost should be incremented by 2 taken
3272      branches.
3273
3274      TODO: The back end may reorder the BBS's differently and reverse
3275      conditions/branch directions.  Change the estimates below to
3276      something more reasonable.  */
3277
3278   /* If the number of iterations is known and we do not do versioning, we can
3279      decide whether to vectorize at compile time.  Hence the scalar version
3280      do not carry cost model guard costs.  */
3281   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3282       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3283     {
3284       /* Cost model check occurs at versioning.  */
3285       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3286         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3287       else
3288         {
3289           /* Cost model check occurs at prologue generation.  */
3290           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3291             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3292               + vect_get_stmt_cost (cond_branch_not_taken);
3293           /* Cost model check occurs at epilogue generation.  */
3294           else
3295             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3296         }
3297     }
3298
3299   /* Complete the target-specific cost calculations.  */
3300   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3301                &vec_inside_cost, &vec_epilogue_cost);
3302
3303   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3304
3305   if (dump_enabled_p ())
3306     {
3307       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3308       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3309                    vec_inside_cost);
3310       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3311                    vec_prologue_cost);
3312       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3313                    vec_epilogue_cost);
3314       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3315                    scalar_single_iter_cost);
3316       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3317                    scalar_outside_cost);
3318       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3319                    vec_outside_cost);
3320       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3321                    peel_iters_prologue);
3322       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3323                    peel_iters_epilogue);
3324     }
3325
3326   /* Calculate number of iterations required to make the vector version
3327      profitable, relative to the loop bodies only.  The following condition
3328      must hold true:
3329      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3330      where
3331      SIC = scalar iteration cost, VIC = vector iteration cost,
3332      VOC = vector outside cost, VF = vectorization factor,
3333      NPEEL = prologue iterations + epilogue iterations,
3334      SOC = scalar outside cost for run time cost model check.  */
3335
3336   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3337                           - vec_inside_cost);
3338   if (saving_per_viter <= 0)
3339     {
3340       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3341         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3342                     "vectorization did not happen for a simd loop");
3343
3344       if (dump_enabled_p ())
3345         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3346                          "cost model: the vector iteration cost = %d "
3347                          "divided by the scalar iteration cost = %d "
3348                          "is greater or equal to the vectorization factor = %d"
3349                          ".\n",
3350                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3351       *ret_min_profitable_niters = -1;
3352       *ret_min_profitable_estimate = -1;
3353       return;
3354     }
3355
3356   /* ??? The "if" arm is written to handle all cases; see below for what
3357      we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
3358   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3359     {
3360       /* Rewriting the condition above in terms of the number of
3361          vector iterations (vniters) rather than the number of
3362          scalar iterations (niters) gives:
3363
3364          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3365
3366          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3367
3368          For integer N, X and Y when X > 0:
3369
3370          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
3371       int outside_overhead = (vec_outside_cost
3372                               - scalar_single_iter_cost * peel_iters_prologue
3373                               - scalar_single_iter_cost * peel_iters_epilogue
3374                               - scalar_outside_cost);
3375       /* We're only interested in cases that require at least one
3376          vector iteration.  */
3377       int min_vec_niters = 1;
3378       if (outside_overhead > 0)
3379         min_vec_niters = outside_overhead / saving_per_viter + 1;
3380
3381       if (dump_enabled_p ())
3382         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
3383                      min_vec_niters);
3384
3385       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3386         {
3387           /* Now that we know the minimum number of vector iterations,
3388              find the minimum niters for which the scalar cost is larger:
3389
3390              SIC * niters > VIC * vniters + VOC - SOC
3391
3392              We know that the minimum niters is no more than
3393              vniters * VF + NPEEL, but it might be (and often is) less
3394              than that if a partial vector iteration is cheaper than the
3395              equivalent scalar code.  */
3396           int threshold = (vec_inside_cost * min_vec_niters
3397                            + vec_outside_cost
3398                            - scalar_outside_cost);
3399           if (threshold <= 0)
3400             min_profitable_iters = 1;
3401           else
3402             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3403         }
3404       else
3405         /* Convert the number of vector iterations into a number of
3406            scalar iterations.  */
3407         min_profitable_iters = (min_vec_niters * assumed_vf
3408                                 + peel_iters_prologue
3409                                 + peel_iters_epilogue);
3410     }
3411   else
3412     {
3413       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3414                               * assumed_vf
3415                               - vec_inside_cost * peel_iters_prologue
3416                               - vec_inside_cost * peel_iters_epilogue);
3417       if (min_profitable_iters <= 0)
3418         min_profitable_iters = 0;
3419       else
3420         {
3421           min_profitable_iters /= saving_per_viter;
3422
3423           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3424               <= (((int) vec_inside_cost * min_profitable_iters)
3425                   + (((int) vec_outside_cost - scalar_outside_cost)
3426                      * assumed_vf)))
3427             min_profitable_iters++;
3428         }
3429     }
3430
3431   if (dump_enabled_p ())
3432     dump_printf (MSG_NOTE,
3433                  "  Calculated minimum iters for profitability: %d\n",
3434                  min_profitable_iters);
3435
3436   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3437       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3438     /* We want the vectorized loop to execute at least once.  */
3439     min_profitable_iters = assumed_vf + peel_iters_prologue;
3440
3441   if (dump_enabled_p ())
3442     dump_printf_loc (MSG_NOTE, vect_location,
3443                      "  Runtime profitability threshold = %d\n",
3444                      min_profitable_iters);
3445
3446   *ret_min_profitable_niters = min_profitable_iters;
3447
3448   /* Calculate number of iterations required to make the vector version
3449      profitable, relative to the loop bodies only.
3450
3451      Non-vectorized variant is SIC * niters and it must win over vector
3452      variant on the expected loop trip count.  The following condition must hold true:
3453      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3454
3455   if (vec_outside_cost <= 0)
3456     min_profitable_estimate = 0;
3457   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3458     {
3459       /* This is a repeat of the code above, but with + SOC rather
3460          than - SOC.  */
3461       int outside_overhead = (vec_outside_cost
3462                               - scalar_single_iter_cost * peel_iters_prologue
3463                               - scalar_single_iter_cost * peel_iters_epilogue
3464                               + scalar_outside_cost);
3465       int min_vec_niters = 1;
3466       if (outside_overhead > 0)
3467         min_vec_niters = outside_overhead / saving_per_viter + 1;
3468
3469       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3470         {
3471           int threshold = (vec_inside_cost * min_vec_niters
3472                            + vec_outside_cost
3473                            + scalar_outside_cost);
3474           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3475         }
3476       else
3477         min_profitable_estimate = (min_vec_niters * assumed_vf
3478                                    + peel_iters_prologue
3479                                    + peel_iters_epilogue);
3480     }
3481   else
3482     {
3483       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3484                                  * assumed_vf
3485                                  - vec_inside_cost * peel_iters_prologue
3486                                  - vec_inside_cost * peel_iters_epilogue)
3487                                  / ((scalar_single_iter_cost * assumed_vf)
3488                                    - vec_inside_cost);
3489     }
3490   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3491   if (dump_enabled_p ())
3492     dump_printf_loc (MSG_NOTE, vect_location,
3493                      "  Static estimate profitability threshold = %d\n",
3494                      min_profitable_estimate);
3495
3496   *ret_min_profitable_estimate = min_profitable_estimate;
3497 }
3498
3499 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3500    vector elements (not bits) for a vector with NELT elements.  */
3501 static void
3502 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3503                               vec_perm_builder *sel)
3504 {
3505   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3506      by vec_perm_indices.  */
3507   sel->new_vector (nelt, 1, 3);
3508   for (unsigned int i = 0; i < 3; i++)
3509     sel->quick_push (i + offset);
3510 }
3511
3512 /* Checks whether the target supports whole-vector shifts for vectors of mode
3513    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3514    it supports vec_perm_const with masks for all necessary shift amounts.  */
3515 static bool
3516 have_whole_vector_shift (machine_mode mode)
3517 {
3518   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3519     return true;
3520
3521   /* Variable-length vectors should be handled via the optab.  */
3522   unsigned int nelt;
3523   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3524     return false;
3525
3526   vec_perm_builder sel;
3527   vec_perm_indices indices;
3528   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3529     {
3530       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3531       indices.new_vector (sel, 2, nelt);
3532       if (!can_vec_perm_const_p (mode, indices, false))
3533         return false;
3534     }
3535   return true;
3536 }
3537
3538 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3539    functions. Design better to avoid maintenance issues.  */
3540
3541 /* Function vect_model_reduction_cost.
3542
3543    Models cost for a reduction operation, including the vector ops
3544    generated within the strip-mine loop, the initial definition before
3545    the loop, and the epilogue code that must be generated.  */
3546
3547 static void
3548 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3549                            vect_reduction_type reduction_type,
3550                            int ncopies, stmt_vector_for_cost *cost_vec)
3551 {
3552   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3553   enum tree_code code;
3554   optab optab;
3555   tree vectype;
3556   machine_mode mode;
3557   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3558   class loop *loop = NULL;
3559
3560   if (loop_vinfo)
3561     loop = LOOP_VINFO_LOOP (loop_vinfo);
3562
3563   /* Condition reductions generate two reductions in the loop.  */
3564   if (reduction_type == COND_REDUCTION)
3565     ncopies *= 2;
3566
3567   vectype = STMT_VINFO_VECTYPE (stmt_info);
3568   mode = TYPE_MODE (vectype);
3569   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3570
3571   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3572
3573   if (reduction_type == EXTRACT_LAST_REDUCTION
3574       || reduction_type == FOLD_LEFT_REDUCTION)
3575     {
3576       /* No extra instructions needed in the prologue.  */
3577       prologue_cost = 0;
3578
3579       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3580         /* Count one reduction-like operation per vector.  */
3581         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3582                                         stmt_info, 0, vect_body);
3583       else
3584         {
3585           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3586           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3587           inside_cost = record_stmt_cost (cost_vec, nelements,
3588                                           vec_to_scalar, stmt_info, 0,
3589                                           vect_body);
3590           inside_cost += record_stmt_cost (cost_vec, nelements,
3591                                            scalar_stmt, stmt_info, 0,
3592                                            vect_body);
3593         }
3594     }
3595   else
3596     {
3597       /* Add in cost for initial definition.
3598          For cond reduction we have four vectors: initial index, step,
3599          initial result of the data reduction, initial value of the index
3600          reduction.  */
3601       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3602       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3603                                          scalar_to_vec, stmt_info, 0,
3604                                          vect_prologue);
3605
3606       /* Cost of reduction op inside loop.  */
3607       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3608                                       stmt_info, 0, vect_body);
3609     }
3610
3611   /* Determine cost of epilogue code.
3612
3613      We have a reduction operator that will reduce the vector in one statement.
3614      Also requires scalar extract.  */
3615
3616   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3617     {
3618       if (reduc_fn != IFN_LAST)
3619         {
3620           if (reduction_type == COND_REDUCTION)
3621             {
3622               /* An EQ stmt and an COND_EXPR stmt.  */
3623               epilogue_cost += record_stmt_cost (cost_vec, 2,
3624                                                  vector_stmt, stmt_info, 0,
3625                                                  vect_epilogue);
3626               /* Reduction of the max index and a reduction of the found
3627                  values.  */
3628               epilogue_cost += record_stmt_cost (cost_vec, 2,
3629                                                  vec_to_scalar, stmt_info, 0,
3630                                                  vect_epilogue);
3631               /* A broadcast of the max value.  */
3632               epilogue_cost += record_stmt_cost (cost_vec, 1,
3633                                                  scalar_to_vec, stmt_info, 0,
3634                                                  vect_epilogue);
3635             }
3636           else
3637             {
3638               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3639                                                  stmt_info, 0, vect_epilogue);
3640               epilogue_cost += record_stmt_cost (cost_vec, 1,
3641                                                  vec_to_scalar, stmt_info, 0,
3642                                                  vect_epilogue);
3643             }
3644         }
3645       else if (reduction_type == COND_REDUCTION)
3646         {
3647           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3648           /* Extraction of scalar elements.  */
3649           epilogue_cost += record_stmt_cost (cost_vec,
3650                                              2 * estimated_nunits,
3651                                              vec_to_scalar, stmt_info, 0,
3652                                              vect_epilogue);
3653           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3654           epilogue_cost += record_stmt_cost (cost_vec,
3655                                              2 * estimated_nunits - 3,
3656                                              scalar_stmt, stmt_info, 0,
3657                                              vect_epilogue);
3658         }
3659       else if (reduction_type == EXTRACT_LAST_REDUCTION
3660                || reduction_type == FOLD_LEFT_REDUCTION)
3661         /* No extra instructions need in the epilogue.  */
3662         ;
3663       else
3664         {
3665           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3666           tree bitsize =
3667             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3668           int element_bitsize = tree_to_uhwi (bitsize);
3669           int nelements = vec_size_in_bits / element_bitsize;
3670
3671           if (code == COND_EXPR)
3672             code = MAX_EXPR;
3673
3674           optab = optab_for_tree_code (code, vectype, optab_default);
3675
3676           /* We have a whole vector shift available.  */
3677           if (optab != unknown_optab
3678               && VECTOR_MODE_P (mode)
3679               && optab_handler (optab, mode) != CODE_FOR_nothing
3680               && have_whole_vector_shift (mode))
3681             {
3682               /* Final reduction via vector shifts and the reduction operator.
3683                  Also requires scalar extract.  */
3684               epilogue_cost += record_stmt_cost (cost_vec,
3685                                                  exact_log2 (nelements) * 2,
3686                                                  vector_stmt, stmt_info, 0,
3687                                                  vect_epilogue);
3688               epilogue_cost += record_stmt_cost (cost_vec, 1,
3689                                                  vec_to_scalar, stmt_info, 0,
3690                                                  vect_epilogue);
3691             }
3692           else
3693             /* Use extracts and reduction op for final reduction.  For N
3694                elements, we have N extracts and N-1 reduction ops.  */
3695             epilogue_cost += record_stmt_cost (cost_vec,
3696                                                nelements + nelements - 1,
3697                                                vector_stmt, stmt_info, 0,
3698                                                vect_epilogue);
3699         }
3700     }
3701
3702   if (dump_enabled_p ())
3703     dump_printf (MSG_NOTE,
3704                  "vect_model_reduction_cost: inside_cost = %d, "
3705                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3706                  prologue_cost, epilogue_cost);
3707 }
3708
3709
3710 /* Function vect_model_induction_cost.
3711
3712    Models cost for induction operations.  */
3713
3714 static void
3715 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3716                            stmt_vector_for_cost *cost_vec)
3717 {
3718   unsigned inside_cost, prologue_cost;
3719
3720   if (PURE_SLP_STMT (stmt_info))
3721     return;
3722
3723   /* loop cost for vec_loop.  */
3724   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3725                                   stmt_info, 0, vect_body);
3726
3727   /* prologue cost for vec_init and vec_step.  */
3728   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3729                                     stmt_info, 0, vect_prologue);
3730
3731   if (dump_enabled_p ())
3732     dump_printf_loc (MSG_NOTE, vect_location,
3733                      "vect_model_induction_cost: inside_cost = %d, "
3734                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3735 }
3736
3737
3738
3739 /* Function get_initial_def_for_reduction
3740
3741    Input:
3742    STMT_VINFO - a stmt that performs a reduction operation in the loop.
3743    INIT_VAL - the initial value of the reduction variable
3744
3745    Output:
3746    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3747         of the reduction (used for adjusting the epilog - see below).
3748    Return a vector variable, initialized according to the operation that
3749         STMT_VINFO performs. This vector will be used as the initial value
3750         of the vector of partial results.
3751
3752    Option1 (adjust in epilog): Initialize the vector as follows:
3753      add/bit or/xor:    [0,0,...,0,0]
3754      mult/bit and:      [1,1,...,1,1]
3755      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3756    and when necessary (e.g. add/mult case) let the caller know
3757    that it needs to adjust the result by init_val.
3758
3759    Option2: Initialize the vector as follows:
3760      add/bit or/xor:    [init_val,0,0,...,0]
3761      mult/bit and:      [init_val,1,1,...,1]
3762      min/max/cond_expr: [init_val,init_val,...,init_val]
3763    and no adjustments are needed.
3764
3765    For example, for the following code:
3766
3767    s = init_val;
3768    for (i=0;i<n;i++)
3769      s = s + a[i];
3770
3771    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3772    For a vector of 4 units, we want to return either [0,0,0,init_val],
3773    or [0,0,0,0] and let the caller know that it needs to adjust
3774    the result at the end by 'init_val'.
3775
3776    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3777    initialization vector is simpler (same element in all entries), if
3778    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3779
3780    A cost model should help decide between these two schemes.  */
3781
3782 static tree
3783 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
3784                                enum tree_code code, tree init_val,
3785                                tree *adjustment_def)
3786 {
3787   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3788   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3789   tree scalar_type = TREE_TYPE (init_val);
3790   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
3791   tree def_for_init;
3792   tree init_def;
3793   REAL_VALUE_TYPE real_init_val = dconst0;
3794   int int_init_val = 0;
3795   gimple_seq stmts = NULL;
3796
3797   gcc_assert (vectype);
3798
3799   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3800               || SCALAR_FLOAT_TYPE_P (scalar_type));
3801
3802   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3803               || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3804
3805   /* ADJUSTMENT_DEF is NULL when called from
3806      vect_create_epilog_for_reduction to vectorize double reduction.  */
3807   if (adjustment_def)
3808     *adjustment_def = NULL;
3809
3810   switch (code)
3811     {
3812     case WIDEN_SUM_EXPR:
3813     case DOT_PROD_EXPR:
3814     case SAD_EXPR:
3815     case PLUS_EXPR:
3816     case MINUS_EXPR:
3817     case BIT_IOR_EXPR:
3818     case BIT_XOR_EXPR:
3819     case MULT_EXPR:
3820     case BIT_AND_EXPR:
3821       {
3822         if (code == MULT_EXPR)
3823           {
3824             real_init_val = dconst1;
3825             int_init_val = 1;
3826           }
3827
3828         if (code == BIT_AND_EXPR)
3829           int_init_val = -1;
3830
3831         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3832           def_for_init = build_real (scalar_type, real_init_val);
3833         else
3834           def_for_init = build_int_cst (scalar_type, int_init_val);
3835
3836         if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
3837           {
3838             /* Option1: the first element is '0' or '1' as well.  */
3839             if (!operand_equal_p (def_for_init, init_val, 0))
3840               *adjustment_def = init_val;
3841             init_def = gimple_build_vector_from_val (&stmts, vectype,
3842                                                      def_for_init);
3843           }
3844         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
3845           {
3846             /* Option2 (variable length): the first element is INIT_VAL.  */
3847             init_def = gimple_build_vector_from_val (&stmts, vectype,
3848                                                      def_for_init);
3849             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
3850                                      vectype, init_def, init_val);
3851           }
3852         else
3853           {
3854             /* Option2: the first element is INIT_VAL.  */
3855             tree_vector_builder elts (vectype, 1, 2);
3856             elts.quick_push (init_val);
3857             elts.quick_push (def_for_init);
3858             init_def = gimple_build_vector (&stmts, &elts);
3859           }
3860       }
3861       break;
3862
3863     case MIN_EXPR:
3864     case MAX_EXPR:
3865     case COND_EXPR:
3866       {
3867         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
3868         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
3869       }
3870       break;
3871
3872     default:
3873       gcc_unreachable ();
3874     }
3875
3876   if (stmts)
3877     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3878   return init_def;
3879 }
3880
3881 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
3882    NUMBER_OF_VECTORS is the number of vector defs to create.
3883    If NEUTRAL_OP is nonnull, introducing extra elements of that
3884    value will not change the result.  */
3885
3886 static void
3887 get_initial_defs_for_reduction (slp_tree slp_node,
3888                                 vec<tree> *vec_oprnds,
3889                                 unsigned int number_of_vectors,
3890                                 bool reduc_chain, tree neutral_op)
3891 {
3892   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3893   stmt_vec_info stmt_vinfo = stmts[0];
3894   vec_info *vinfo = stmt_vinfo->vinfo;
3895   unsigned HOST_WIDE_INT nunits;
3896   unsigned j, number_of_places_left_in_vector;
3897   tree vector_type;
3898   unsigned int group_size = stmts.length ();
3899   unsigned int i;
3900   class loop *loop;
3901
3902   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
3903
3904   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
3905
3906   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
3907   gcc_assert (loop);
3908   edge pe = loop_preheader_edge (loop);
3909
3910   gcc_assert (!reduc_chain || neutral_op);
3911
3912   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3913      created vectors. It is greater than 1 if unrolling is performed.
3914
3915      For example, we have two scalar operands, s1 and s2 (e.g., group of
3916      strided accesses of size two), while NUNITS is four (i.e., four scalars
3917      of this type can be packed in a vector).  The output vector will contain
3918      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
3919      will be 2).
3920
3921      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
3922      vectors containing the operands.
3923
3924      For example, NUNITS is four as before, and the group size is 8
3925      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
3926      {s5, s6, s7, s8}.  */
3927
3928   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3929     nunits = group_size;
3930
3931   number_of_places_left_in_vector = nunits;
3932   bool constant_p = true;
3933   tree_vector_builder elts (vector_type, nunits, 1);
3934   elts.quick_grow (nunits);
3935   gimple_seq ctor_seq = NULL;
3936   for (j = 0; j < nunits * number_of_vectors; ++j)
3937     {
3938       tree op;
3939       i = j % group_size;
3940       stmt_vinfo = stmts[i];
3941
3942       /* Get the def before the loop.  In reduction chain we have only
3943          one initial value.  Else we have as many as PHIs in the group.  */
3944       if (reduc_chain)
3945         op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
3946       else if (((vec_oprnds->length () + 1) * nunits
3947                 - number_of_places_left_in_vector >= group_size)
3948                && neutral_op)
3949         op = neutral_op;
3950       else
3951         op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
3952
3953       /* Create 'vect_ = {op0,op1,...,opn}'.  */
3954       number_of_places_left_in_vector--;
3955       elts[nunits - number_of_places_left_in_vector - 1] = op;
3956       if (!CONSTANT_CLASS_P (op))
3957         constant_p = false;
3958
3959       if (number_of_places_left_in_vector == 0)
3960         {
3961           tree init;
3962           if (constant_p && !neutral_op
3963               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3964               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3965             /* Build the vector directly from ELTS.  */
3966             init = gimple_build_vector (&ctor_seq, &elts);
3967           else if (neutral_op)
3968             {
3969               /* Build a vector of the neutral value and shift the
3970                  other elements into place.  */
3971               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
3972                                                    neutral_op);
3973               int k = nunits;
3974               while (k > 0 && elts[k - 1] == neutral_op)
3975                 k -= 1;
3976               while (k > 0)
3977                 {
3978                   k -= 1;
3979                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
3980                                        vector_type, init, elts[k]);
3981                 }
3982             }
3983           else
3984             {
3985               /* First time round, duplicate ELTS to fill the
3986                  required number of vectors.  */
3987               duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
3988                                         number_of_vectors, *vec_oprnds);
3989               break;
3990             }
3991           vec_oprnds->quick_push (init);
3992
3993           number_of_places_left_in_vector = nunits;
3994           elts.new_vector (vector_type, nunits, 1);
3995           elts.quick_grow (nunits);
3996           constant_p = true;
3997         }
3998     }
3999   if (ctor_seq != NULL)
4000     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4001 }
4002
4003 /* For a statement STMT_INFO taking part in a reduction operation return
4004    the stmt_vec_info the meta information is stored on.  */
4005
4006 stmt_vec_info
4007 info_for_reduction (stmt_vec_info stmt_info)
4008 {
4009   stmt_info = vect_orig_stmt (stmt_info);
4010   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4011   if (!is_a <gphi *> (stmt_info->stmt))
4012     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4013   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4014   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4015     {
4016       if (gimple_phi_num_args (phi) == 1)
4017         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4018     }
4019   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4020     {
4021       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4022       stmt_vec_info info
4023           = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4024       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4025         stmt_info = info;
4026     }
4027   return stmt_info;
4028 }
4029
4030 /* Function vect_create_epilog_for_reduction
4031
4032    Create code at the loop-epilog to finalize the result of a reduction
4033    computation.
4034
4035    STMT_INFO is the scalar reduction stmt that is being vectorized.
4036    SLP_NODE is an SLP node containing a group of reduction statements. The
4037      first one in this group is STMT_INFO.
4038    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4039    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4040      (counting from 0)
4041
4042    This function:
4043    1. Completes the reduction def-use cycles.
4044    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4045       by calling the function specified by REDUC_FN if available, or by
4046       other means (whole-vector shifts or a scalar loop).
4047       The function also creates a new phi node at the loop exit to preserve
4048       loop-closed form, as illustrated below.
4049
4050      The flow at the entry to this function:
4051
4052         loop:
4053           vec_def = phi <vec_init, null>        # REDUCTION_PHI
4054           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4055           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4056         loop_exit:
4057           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4058           use <s_out0>
4059           use <s_out0>
4060
4061      The above is transformed by this function into:
4062
4063         loop:
4064           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4065           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4066           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4067         loop_exit:
4068           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4069           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4070           v_out2 = reduce <v_out1>
4071           s_out3 = extract_field <v_out2, 0>
4072           s_out4 = adjust_result <s_out3>
4073           use <s_out4>
4074           use <s_out4>
4075 */
4076
4077 static void
4078 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4079                                   slp_tree slp_node,
4080                                   slp_instance slp_node_instance)
4081 {
4082   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4083   gcc_assert (reduc_info->is_reduc_info);
4084   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4085   /* For double reductions we need to get at the inner loop reduction
4086      stmt which has the meta info attached.  Our stmt_info is that of the
4087      loop-closed PHI of the inner loop which we remember as
4088      def for the reduction PHI generation.  */
4089   bool double_reduc = false;
4090   stmt_vec_info rdef_info = stmt_info;
4091   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4092     {
4093       gcc_assert (!slp_node);
4094       double_reduc = true;
4095       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4096                                             (stmt_info->stmt, 0));
4097       stmt_info = vect_stmt_to_vectorize (stmt_info);
4098     }
4099   gphi *reduc_def_stmt
4100     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4101   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4102   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4103   tree neutral_op = NULL_TREE;
4104   if (slp_node)
4105     neutral_op
4106       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
4107                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4108   stmt_vec_info prev_phi_info;
4109   tree vectype;
4110   machine_mode mode;
4111   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4112   basic_block exit_bb;
4113   tree scalar_dest;
4114   tree scalar_type;
4115   gimple *new_phi = NULL, *phi;
4116   stmt_vec_info phi_info;
4117   gimple_stmt_iterator exit_gsi;
4118   tree vec_dest;
4119   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4120   gimple *epilog_stmt = NULL;
4121   gimple *exit_phi;
4122   tree bitsize;
4123   tree expr, def;
4124   tree orig_name, scalar_result;
4125   imm_use_iterator imm_iter, phi_imm_iter;
4126   use_operand_p use_p, phi_use_p;
4127   gimple *use_stmt;
4128   bool nested_in_vect_loop = false;
4129   auto_vec<gimple *> new_phis;
4130   int j, i;
4131   auto_vec<tree> scalar_results;
4132   unsigned int group_size = 1, k;
4133   auto_vec<gimple *> phis;
4134   bool slp_reduc = false;
4135   bool direct_slp_reduc;
4136   tree new_phi_result;
4137   tree induction_index = NULL_TREE;
4138
4139   if (slp_node)
4140     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4141
4142   if (nested_in_vect_loop_p (loop, stmt_info))
4143     {
4144       outer_loop = loop;
4145       loop = loop->inner;
4146       nested_in_vect_loop = true;
4147       gcc_assert (!slp_node);
4148     }
4149   gcc_assert (!nested_in_vect_loop || double_reduc);
4150
4151   vectype = STMT_VINFO_VECTYPE (stmt_info);
4152   gcc_assert (vectype);
4153   mode = TYPE_MODE (vectype);
4154
4155   tree initial_def = NULL;
4156   tree induc_val = NULL_TREE;
4157   tree adjustment_def = NULL;
4158   if (slp_node)
4159     ;
4160   else
4161     {
4162       /* Get at the scalar def before the loop, that defines the initial value
4163          of the reduction variable.  */
4164       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4165                                            loop_preheader_edge (loop));
4166       /* Optimize: for induction condition reduction, if we can't use zero
4167          for induc_val, use initial_def.  */
4168       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4169         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4170       else if (double_reduc)
4171         ;
4172       else if (nested_in_vect_loop)
4173         ;
4174       else
4175         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4176     }
4177
4178   unsigned vec_num;
4179   int ncopies;
4180   if (slp_node)
4181     {
4182       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4183       ncopies = 1;
4184     }
4185   else
4186     {
4187       vec_num = 1;
4188       ncopies = 0;
4189       phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4190       do
4191         {
4192           ncopies++;
4193           phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4194         }
4195       while (phi_info);
4196     }
4197
4198   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4199      which is updated with the current index of the loop for every match of
4200      the original loop's cond_expr (VEC_STMT).  This results in a vector
4201      containing the last time the condition passed for that vector lane.
4202      The first match will be a 1 to allow 0 to be used for non-matching
4203      indexes.  If there are no matches at all then the vector will be all
4204      zeroes.  */
4205   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4206     {
4207       tree indx_before_incr, indx_after_incr;
4208       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4209
4210       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4211       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4212
4213       int scalar_precision
4214         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4215       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4216       tree cr_index_vector_type = build_vector_type
4217         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4218
4219       /* First we create a simple vector induction variable which starts
4220          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4221          vector size (STEP).  */
4222
4223       /* Create a {1,2,3,...} vector.  */
4224       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4225
4226       /* Create a vector of the step value.  */
4227       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4228       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4229
4230       /* Create an induction variable.  */
4231       gimple_stmt_iterator incr_gsi;
4232       bool insert_after;
4233       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4234       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4235                  insert_after, &indx_before_incr, &indx_after_incr);
4236
4237       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4238          filled with zeros (VEC_ZERO).  */
4239
4240       /* Create a vector of 0s.  */
4241       tree zero = build_zero_cst (cr_index_scalar_type);
4242       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4243
4244       /* Create a vector phi node.  */
4245       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4246       new_phi = create_phi_node (new_phi_tree, loop->header);
4247       loop_vinfo->add_stmt (new_phi);
4248       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4249                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4250
4251       /* Now take the condition from the loops original cond_expr
4252          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4253          every match uses values from the induction variable
4254          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4255          (NEW_PHI_TREE).
4256          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4257          the new cond_expr (INDEX_COND_EXPR).  */
4258
4259       /* Duplicate the condition from vec_stmt.  */
4260       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4261
4262       /* Create a conditional, where the condition is taken from vec_stmt
4263          (CCOMPARE).  The then and else values mirror the main VEC_COND_EXPR:
4264          the reduction phi corresponds to NEW_PHI_TREE and the new values
4265          correspond to INDEX_BEFORE_INCR.  */
4266       gcc_assert (STMT_VINFO_REDUC_IDX (reduc_info) >= 1);
4267       tree index_cond_expr;
4268       if (STMT_VINFO_REDUC_IDX (reduc_info) == 2)
4269         index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4270                                   ccompare, indx_before_incr, new_phi_tree);
4271       else
4272         index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4273                                   ccompare, new_phi_tree, indx_before_incr);
4274       induction_index = make_ssa_name (cr_index_vector_type);
4275       gimple *index_condition = gimple_build_assign (induction_index,
4276                                                      index_cond_expr);
4277       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4278       stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4279       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4280
4281       /* Update the phi with the vec cond.  */
4282       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4283                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4284     }
4285
4286   /* 2. Create epilog code.
4287         The reduction epilog code operates across the elements of the vector
4288         of partial results computed by the vectorized loop.
4289         The reduction epilog code consists of:
4290
4291         step 1: compute the scalar result in a vector (v_out2)
4292         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4293         step 3: adjust the scalar result (s_out3) if needed.
4294
4295         Step 1 can be accomplished using one the following three schemes:
4296           (scheme 1) using reduc_fn, if available.
4297           (scheme 2) using whole-vector shifts, if available.
4298           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4299                      combined.
4300
4301           The overall epilog code looks like this:
4302
4303           s_out0 = phi <s_loop>         # original EXIT_PHI
4304           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4305           v_out2 = reduce <v_out1>              # step 1
4306           s_out3 = extract_field <v_out2, 0>    # step 2
4307           s_out4 = adjust_result <s_out3>       # step 3
4308
4309           (step 3 is optional, and steps 1 and 2 may be combined).
4310           Lastly, the uses of s_out0 are replaced by s_out4.  */
4311
4312
4313   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4314          v_out1 = phi <VECT_DEF>
4315          Store them in NEW_PHIS.  */
4316   if (double_reduc)
4317     loop = outer_loop;
4318   exit_bb = single_exit (loop)->dest;
4319   prev_phi_info = NULL;
4320   new_phis.create (slp_node ? vec_num : ncopies);
4321   for (unsigned i = 0; i < vec_num; i++)
4322     {
4323       if (slp_node)
4324         def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4325       else
4326         def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4327       for (j = 0; j < ncopies; j++)
4328         {
4329           tree new_def = copy_ssa_name (def);
4330           phi = create_phi_node (new_def, exit_bb);
4331           stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4332           if (j == 0)
4333             new_phis.quick_push (phi);
4334           else
4335             {
4336               def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4337               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4338             }
4339
4340           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4341           prev_phi_info = phi_info;
4342         }
4343     }
4344
4345   exit_gsi = gsi_after_labels (exit_bb);
4346
4347   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4348          (i.e. when reduc_fn is not available) and in the final adjustment
4349          code (if needed).  Also get the original scalar reduction variable as
4350          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4351          represents a reduction pattern), the tree-code and scalar-def are
4352          taken from the original stmt that the pattern-stmt (STMT) replaces.
4353          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4354          are taken from STMT.  */
4355
4356   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4357   if (orig_stmt_info != stmt_info)
4358     {
4359       /* Reduction pattern  */
4360       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4361       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4362     }
4363
4364   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4365   scalar_type = TREE_TYPE (scalar_dest);
4366   scalar_results.create (group_size);
4367   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4368   bitsize = TYPE_SIZE (scalar_type);
4369
4370   /* SLP reduction without reduction chain, e.g.,
4371      # a1 = phi <a2, a0>
4372      # b1 = phi <b2, b0>
4373      a2 = operation (a1)
4374      b2 = operation (b1)  */
4375   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4376
4377   /* True if we should implement SLP_REDUC using native reduction operations
4378      instead of scalar operations.  */
4379   direct_slp_reduc = (reduc_fn != IFN_LAST
4380                       && slp_reduc
4381                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4382
4383   /* In case of reduction chain, e.g.,
4384      # a1 = phi <a3, a0>
4385      a2 = operation (a1)
4386      a3 = operation (a2),
4387
4388      we may end up with more than one vector result.  Here we reduce them to
4389      one vector.  */
4390   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4391     {
4392       tree first_vect = PHI_RESULT (new_phis[0]);
4393       gassign *new_vec_stmt = NULL;
4394       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4395       for (k = 1; k < new_phis.length (); k++)
4396         {
4397           gimple *next_phi = new_phis[k];
4398           tree second_vect = PHI_RESULT (next_phi);
4399           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4400           new_vec_stmt = gimple_build_assign (tem, code,
4401                                               first_vect, second_vect);
4402           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4403           first_vect = tem;
4404         }
4405
4406       new_phi_result = first_vect;
4407       if (new_vec_stmt)
4408         {
4409           new_phis.truncate (0);
4410           new_phis.safe_push (new_vec_stmt);
4411         }
4412     }
4413   /* Likewise if we couldn't use a single defuse cycle.  */
4414   else if (ncopies > 1)
4415     {
4416       gcc_assert (new_phis.length () == 1);
4417       tree first_vect = PHI_RESULT (new_phis[0]);
4418       gassign *new_vec_stmt = NULL;
4419       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4420       stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4421       for (int k = 1; k < ncopies; ++k)
4422         {
4423           next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4424           tree second_vect = PHI_RESULT (next_phi_info->stmt);
4425           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4426           new_vec_stmt = gimple_build_assign (tem, code,
4427                                               first_vect, second_vect);
4428           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4429           first_vect = tem;
4430         }
4431       new_phi_result = first_vect;
4432       new_phis.truncate (0);
4433       new_phis.safe_push (new_vec_stmt);
4434     }
4435   else
4436     new_phi_result = PHI_RESULT (new_phis[0]);
4437
4438   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4439       && reduc_fn != IFN_LAST)
4440     {
4441       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4442          various data values where the condition matched and another vector
4443          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4444          need to extract the last matching index (which will be the index with
4445          highest value) and use this to index into the data vector.
4446          For the case where there were no matches, the data vector will contain
4447          all default values and the index vector will be all zeros.  */
4448
4449       /* Get various versions of the type of the vector of indexes.  */
4450       tree index_vec_type = TREE_TYPE (induction_index);
4451       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4452       tree index_scalar_type = TREE_TYPE (index_vec_type);
4453       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4454         (index_vec_type);
4455
4456       /* Get an unsigned integer version of the type of the data vector.  */
4457       int scalar_precision
4458         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4459       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4460       tree vectype_unsigned = build_vector_type
4461         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4462
4463       /* First we need to create a vector (ZERO_VEC) of zeros and another
4464          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4465          can create using a MAX reduction and then expanding.
4466          In the case where the loop never made any matches, the max index will
4467          be zero.  */
4468
4469       /* Vector of {0, 0, 0,...}.  */
4470       tree zero_vec = make_ssa_name (vectype);
4471       tree zero_vec_rhs = build_zero_cst (vectype);
4472       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4473       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4474
4475       /* Find maximum value from the vector of found indexes.  */
4476       tree max_index = make_ssa_name (index_scalar_type);
4477       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4478                                                           1, induction_index);
4479       gimple_call_set_lhs (max_index_stmt, max_index);
4480       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4481
4482       /* Vector of {max_index, max_index, max_index,...}.  */
4483       tree max_index_vec = make_ssa_name (index_vec_type);
4484       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4485                                                       max_index);
4486       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4487                                                         max_index_vec_rhs);
4488       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4489
4490       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4491          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4492          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4493          otherwise.  Only one value should match, resulting in a vector
4494          (VEC_COND) with one data value and the rest zeros.
4495          In the case where the loop never made any matches, every index will
4496          match, resulting in a vector with all data values (which will all be
4497          the default value).  */
4498
4499       /* Compare the max index vector to the vector of found indexes to find
4500          the position of the max value.  */
4501       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4502       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4503                                                       induction_index,
4504                                                       max_index_vec);
4505       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4506
4507       /* Use the compare to choose either values from the data vector or
4508          zero.  */
4509       tree vec_cond = make_ssa_name (vectype);
4510       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4511                                                    vec_compare, new_phi_result,
4512                                                    zero_vec);
4513       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4514
4515       /* Finally we need to extract the data value from the vector (VEC_COND)
4516          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4517          reduction, but because this doesn't exist, we can use a MAX reduction
4518          instead.  The data value might be signed or a float so we need to cast
4519          it first.
4520          In the case where the loop never made any matches, the data values are
4521          all identical, and so will reduce down correctly.  */
4522
4523       /* Make the matched data values unsigned.  */
4524       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4525       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4526                                        vec_cond);
4527       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4528                                                         VIEW_CONVERT_EXPR,
4529                                                         vec_cond_cast_rhs);
4530       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4531
4532       /* Reduce down to a scalar value.  */
4533       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4534       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4535                                                            1, vec_cond_cast);
4536       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4537       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4538
4539       /* Convert the reduced value back to the result type and set as the
4540          result.  */
4541       gimple_seq stmts = NULL;
4542       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4543                                data_reduc);
4544       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4545       scalar_results.safe_push (new_temp);
4546     }
4547   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4548            && reduc_fn == IFN_LAST)
4549     {
4550       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4551          idx = 0;
4552          idx_val = induction_index[0];
4553          val = data_reduc[0];
4554          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4555            if (induction_index[i] > idx_val)
4556              val = data_reduc[i], idx_val = induction_index[i];
4557          return val;  */
4558
4559       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4560       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4561       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4562       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4563       /* Enforced by vectorizable_reduction, which ensures we have target
4564          support before allowing a conditional reduction on variable-length
4565          vectors.  */
4566       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4567       tree idx_val = NULL_TREE, val = NULL_TREE;
4568       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4569         {
4570           tree old_idx_val = idx_val;
4571           tree old_val = val;
4572           idx_val = make_ssa_name (idx_eltype);
4573           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4574                                              build3 (BIT_FIELD_REF, idx_eltype,
4575                                                      induction_index,
4576                                                      bitsize_int (el_size),
4577                                                      bitsize_int (off)));
4578           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4579           val = make_ssa_name (data_eltype);
4580           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4581                                              build3 (BIT_FIELD_REF,
4582                                                      data_eltype,
4583                                                      new_phi_result,
4584                                                      bitsize_int (el_size),
4585                                                      bitsize_int (off)));
4586           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4587           if (off != 0)
4588             {
4589               tree new_idx_val = idx_val;
4590               if (off != v_size - el_size)
4591                 {
4592                   new_idx_val = make_ssa_name (idx_eltype);
4593                   epilog_stmt = gimple_build_assign (new_idx_val,
4594                                                      MAX_EXPR, idx_val,
4595                                                      old_idx_val);
4596                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4597                 }
4598               tree new_val = make_ssa_name (data_eltype);
4599               epilog_stmt = gimple_build_assign (new_val,
4600                                                  COND_EXPR,
4601                                                  build2 (GT_EXPR,
4602                                                          boolean_type_node,
4603                                                          idx_val,
4604                                                          old_idx_val),
4605                                                  val, old_val);
4606               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4607               idx_val = new_idx_val;
4608               val = new_val;
4609             }
4610         }
4611       /* Convert the reduced value back to the result type and set as the
4612          result.  */
4613       gimple_seq stmts = NULL;
4614       val = gimple_convert (&stmts, scalar_type, val);
4615       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4616       scalar_results.safe_push (val);
4617     }
4618
4619   /* 2.3 Create the reduction code, using one of the three schemes described
4620          above. In SLP we simply need to extract all the elements from the
4621          vector (without reducing them), so we use scalar shifts.  */
4622   else if (reduc_fn != IFN_LAST && !slp_reduc)
4623     {
4624       tree tmp;
4625       tree vec_elem_type;
4626
4627       /* Case 1:  Create:
4628          v_out2 = reduc_expr <v_out1>  */
4629
4630       if (dump_enabled_p ())
4631         dump_printf_loc (MSG_NOTE, vect_location,
4632                          "Reduce using direct vector reduction.\n");
4633
4634       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4635       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4636         {
4637           tree tmp_dest
4638             = vect_create_destination_var (scalar_dest, vec_elem_type);
4639           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4640                                                     new_phi_result);
4641           gimple_set_lhs (epilog_stmt, tmp_dest);
4642           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4643           gimple_set_lhs (epilog_stmt, new_temp);
4644           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4645
4646           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4647                                              new_temp);
4648         }
4649       else
4650         {
4651           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4652                                                     new_phi_result);
4653           gimple_set_lhs (epilog_stmt, new_scalar_dest);
4654         }
4655
4656       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4657       gimple_set_lhs (epilog_stmt, new_temp);
4658       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4659
4660       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4661           && induc_val)
4662         {
4663           /* Earlier we set the initial value to be a vector if induc_val
4664              values.  Check the result and if it is induc_val then replace
4665              with the original initial value, unless induc_val is
4666              the same as initial_def already.  */
4667           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4668                                   induc_val);
4669
4670           tmp = make_ssa_name (new_scalar_dest);
4671           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4672                                              initial_def, new_temp);
4673           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4674           new_temp = tmp;
4675         }
4676
4677       scalar_results.safe_push (new_temp);
4678     }
4679   else if (direct_slp_reduc)
4680     {
4681       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4682          with the elements for other SLP statements replaced with the
4683          neutral value.  We can then do a normal reduction on each vector.  */
4684
4685       /* Enforced by vectorizable_reduction.  */
4686       gcc_assert (new_phis.length () == 1);
4687       gcc_assert (pow2p_hwi (group_size));
4688
4689       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4690       vec<stmt_vec_info> orig_phis
4691         = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4692       gimple_seq seq = NULL;
4693
4694       /* Build a vector {0, 1, 2, ...}, with the same number of elements
4695          and the same element size as VECTYPE.  */
4696       tree index = build_index_vector (vectype, 0, 1);
4697       tree index_type = TREE_TYPE (index);
4698       tree index_elt_type = TREE_TYPE (index_type);
4699       tree mask_type = build_same_sized_truth_vector_type (index_type);
4700
4701       /* Create a vector that, for each element, identifies which of
4702          the REDUC_GROUP_SIZE results should use it.  */
4703       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
4704       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
4705                             build_vector_from_val (index_type, index_mask));
4706
4707       /* Get a neutral vector value.  This is simply a splat of the neutral
4708          scalar value if we have one, otherwise the initial scalar value
4709          is itself a neutral value.  */
4710       tree vector_identity = NULL_TREE;
4711       if (neutral_op)
4712         vector_identity = gimple_build_vector_from_val (&seq, vectype,
4713                                                         neutral_op);
4714       for (unsigned int i = 0; i < group_size; ++i)
4715         {
4716           /* If there's no univeral neutral value, we can use the
4717              initial scalar value from the original PHI.  This is used
4718              for MIN and MAX reduction, for example.  */
4719           if (!neutral_op)
4720             {
4721               tree scalar_value
4722                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
4723                                          loop_preheader_edge (loop));
4724               vector_identity = gimple_build_vector_from_val (&seq, vectype,
4725                                                               scalar_value);
4726             }
4727
4728           /* Calculate the equivalent of:
4729
4730              sel[j] = (index[j] == i);
4731
4732              which selects the elements of NEW_PHI_RESULT that should
4733              be included in the result.  */
4734           tree compare_val = build_int_cst (index_elt_type, i);
4735           compare_val = build_vector_from_val (index_type, compare_val);
4736           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
4737                                    index, compare_val);
4738
4739           /* Calculate the equivalent of:
4740
4741              vec = seq ? new_phi_result : vector_identity;
4742
4743              VEC is now suitable for a full vector reduction.  */
4744           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
4745                                    sel, new_phi_result, vector_identity);
4746
4747           /* Do the reduction and convert it to the appropriate type.  */
4748           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
4749                                       TREE_TYPE (vectype), vec);
4750           scalar = gimple_convert (&seq, scalar_type, scalar);
4751           scalar_results.safe_push (scalar);
4752         }
4753       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
4754     }
4755   else
4756     {
4757       bool reduce_with_shift;
4758       tree vec_temp;
4759
4760       /* See if the target wants to do the final (shift) reduction
4761          in a vector mode of smaller size and first reduce upper/lower
4762          halves against each other.  */
4763       enum machine_mode mode1 = mode;
4764       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
4765       unsigned sz1 = sz;
4766       if (!slp_reduc
4767           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
4768         sz1 = GET_MODE_SIZE (mode1).to_constant ();
4769
4770       tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
4771       reduce_with_shift = have_whole_vector_shift (mode1);
4772       if (!VECTOR_MODE_P (mode1))
4773         reduce_with_shift = false;
4774       else
4775         {
4776           optab optab = optab_for_tree_code (code, vectype1, optab_default);
4777           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
4778             reduce_with_shift = false;
4779         }
4780
4781       /* First reduce the vector to the desired vector size we should
4782          do shift reduction on by combining upper and lower halves.  */
4783       new_temp = new_phi_result;
4784       while (sz > sz1)
4785         {
4786           gcc_assert (!slp_reduc);
4787           sz /= 2;
4788           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
4789
4790           /* The target has to make sure we support lowpart/highpart
4791              extraction, either via direct vector extract or through
4792              an integer mode punning.  */
4793           tree dst1, dst2;
4794           if (convert_optab_handler (vec_extract_optab,
4795                                      TYPE_MODE (TREE_TYPE (new_temp)),
4796                                      TYPE_MODE (vectype1))
4797               != CODE_FOR_nothing)
4798             {
4799               /* Extract sub-vectors directly once vec_extract becomes
4800                  a conversion optab.  */
4801               dst1 = make_ssa_name (vectype1);
4802               epilog_stmt
4803                   = gimple_build_assign (dst1, BIT_FIELD_REF,
4804                                          build3 (BIT_FIELD_REF, vectype1,
4805                                                  new_temp, TYPE_SIZE (vectype1),
4806                                                  bitsize_int (0)));
4807               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4808               dst2 =  make_ssa_name (vectype1);
4809               epilog_stmt
4810                   = gimple_build_assign (dst2, BIT_FIELD_REF,
4811                                          build3 (BIT_FIELD_REF, vectype1,
4812                                                  new_temp, TYPE_SIZE (vectype1),
4813                                                  bitsize_int (sz * BITS_PER_UNIT)));
4814               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4815             }
4816           else
4817             {
4818               /* Extract via punning to appropriately sized integer mode
4819                  vector.  */
4820               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
4821                                                             1);
4822               tree etype = build_vector_type (eltype, 2);
4823               gcc_assert (convert_optab_handler (vec_extract_optab,
4824                                                  TYPE_MODE (etype),
4825                                                  TYPE_MODE (eltype))
4826                           != CODE_FOR_nothing);
4827               tree tem = make_ssa_name (etype);
4828               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4829                                                  build1 (VIEW_CONVERT_EXPR,
4830                                                          etype, new_temp));
4831               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4832               new_temp = tem;
4833               tem = make_ssa_name (eltype);
4834               epilog_stmt
4835                   = gimple_build_assign (tem, BIT_FIELD_REF,
4836                                          build3 (BIT_FIELD_REF, eltype,
4837                                                  new_temp, TYPE_SIZE (eltype),
4838                                                  bitsize_int (0)));
4839               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4840               dst1 = make_ssa_name (vectype1);
4841               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
4842                                                  build1 (VIEW_CONVERT_EXPR,
4843                                                          vectype1, tem));
4844               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4845               tem = make_ssa_name (eltype);
4846               epilog_stmt
4847                   = gimple_build_assign (tem, BIT_FIELD_REF,
4848                                          build3 (BIT_FIELD_REF, eltype,
4849                                                  new_temp, TYPE_SIZE (eltype),
4850                                                  bitsize_int (sz * BITS_PER_UNIT)));
4851               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4852               dst2 =  make_ssa_name (vectype1);
4853               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
4854                                                  build1 (VIEW_CONVERT_EXPR,
4855                                                          vectype1, tem));
4856               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4857             }
4858
4859           new_temp = make_ssa_name (vectype1);
4860           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
4861           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4862         }
4863
4864       if (reduce_with_shift && !slp_reduc)
4865         {
4866           int element_bitsize = tree_to_uhwi (bitsize);
4867           /* Enforced by vectorizable_reduction, which disallows SLP reductions
4868              for variable-length vectors and also requires direct target support
4869              for loop reductions.  */
4870           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4871           int nelements = vec_size_in_bits / element_bitsize;
4872           vec_perm_builder sel;
4873           vec_perm_indices indices;
4874
4875           int elt_offset;
4876
4877           tree zero_vec = build_zero_cst (vectype1);
4878           /* Case 2: Create:
4879              for (offset = nelements/2; offset >= 1; offset/=2)
4880                 {
4881                   Create:  va' = vec_shift <va, offset>
4882                   Create:  va = vop <va, va'>
4883                 }  */
4884
4885           tree rhs;
4886
4887           if (dump_enabled_p ())
4888             dump_printf_loc (MSG_NOTE, vect_location,
4889                              "Reduce using vector shifts\n");
4890
4891           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
4892           for (elt_offset = nelements / 2;
4893                elt_offset >= 1;
4894                elt_offset /= 2)
4895             {
4896               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
4897               indices.new_vector (sel, 2, nelements);
4898               tree mask = vect_gen_perm_mask_any (vectype1, indices);
4899               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4900                                                  new_temp, zero_vec, mask);
4901               new_name = make_ssa_name (vec_dest, epilog_stmt);
4902               gimple_assign_set_lhs (epilog_stmt, new_name);
4903               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4904
4905               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4906                                                  new_temp);
4907               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4908               gimple_assign_set_lhs (epilog_stmt, new_temp);
4909               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4910             }
4911
4912           /* 2.4  Extract the final scalar result.  Create:
4913              s_out3 = extract_field <v_out2, bitpos>  */
4914
4915           if (dump_enabled_p ())
4916             dump_printf_loc (MSG_NOTE, vect_location,
4917                              "extract scalar result\n");
4918
4919           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4920                         bitsize, bitsize_zero_node);
4921           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4922           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4923           gimple_assign_set_lhs (epilog_stmt, new_temp);
4924           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4925           scalar_results.safe_push (new_temp);
4926         }
4927       else
4928         {
4929           /* Case 3: Create:
4930              s = extract_field <v_out2, 0>
4931              for (offset = element_size;
4932                   offset < vector_size;
4933                   offset += element_size;)
4934                {
4935                  Create:  s' = extract_field <v_out2, offset>
4936                  Create:  s = op <s, s'>  // For non SLP cases
4937                }  */
4938
4939           if (dump_enabled_p ())
4940             dump_printf_loc (MSG_NOTE, vect_location,
4941                              "Reduce using scalar code.\n");
4942
4943           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4944           int element_bitsize = tree_to_uhwi (bitsize);
4945           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4946             {
4947               int bit_offset;
4948               if (gimple_code (new_phi) == GIMPLE_PHI)
4949                 vec_temp = PHI_RESULT (new_phi);
4950               else
4951                 vec_temp = gimple_assign_lhs (new_phi);
4952               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4953                                  bitsize_zero_node);
4954               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4955               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4956               gimple_assign_set_lhs (epilog_stmt, new_temp);
4957               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4958
4959               /* In SLP we don't need to apply reduction operation, so we just
4960                  collect s' values in SCALAR_RESULTS.  */
4961               if (slp_reduc)
4962                 scalar_results.safe_push (new_temp);
4963
4964               for (bit_offset = element_bitsize;
4965                    bit_offset < vec_size_in_bits;
4966                    bit_offset += element_bitsize)
4967                 {
4968                   tree bitpos = bitsize_int (bit_offset);
4969                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4970                                      bitsize, bitpos);
4971
4972                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4973                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4974                   gimple_assign_set_lhs (epilog_stmt, new_name);
4975                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4976
4977                   if (slp_reduc)
4978                     {
4979                       /* In SLP we don't need to apply reduction operation, so
4980                          we just collect s' values in SCALAR_RESULTS.  */
4981                       new_temp = new_name;
4982                       scalar_results.safe_push (new_name);
4983                     }
4984                   else
4985                     {
4986                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4987                                                          new_name, new_temp);
4988                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4989                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4990                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4991                     }
4992                 }
4993             }
4994
4995           /* The only case where we need to reduce scalar results in SLP, is
4996              unrolling.  If the size of SCALAR_RESULTS is greater than
4997              REDUC_GROUP_SIZE, we reduce them combining elements modulo
4998              REDUC_GROUP_SIZE.  */
4999           if (slp_reduc)
5000             {
5001               tree res, first_res, new_res;
5002               gimple *new_stmt;
5003
5004               /* Reduce multiple scalar results in case of SLP unrolling.  */
5005               for (j = group_size; scalar_results.iterate (j, &res);
5006                    j++)
5007                 {
5008                   first_res = scalar_results[j % group_size];
5009                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5010                                                   first_res, res);
5011                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5012                   gimple_assign_set_lhs (new_stmt, new_res);
5013                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5014                   scalar_results[j % group_size] = new_res;
5015                 }
5016             }
5017           else
5018             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5019             scalar_results.safe_push (new_temp);
5020         }
5021
5022       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5023           && induc_val)
5024         {
5025           /* Earlier we set the initial value to be a vector if induc_val
5026              values.  Check the result and if it is induc_val then replace
5027              with the original initial value, unless induc_val is
5028              the same as initial_def already.  */
5029           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5030                                   induc_val);
5031
5032           tree tmp = make_ssa_name (new_scalar_dest);
5033           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5034                                              initial_def, new_temp);
5035           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5036           scalar_results[0] = tmp;
5037         }
5038     }
5039
5040   /* 2.5 Adjust the final result by the initial value of the reduction
5041          variable. (When such adjustment is not needed, then
5042          'adjustment_def' is zero).  For example, if code is PLUS we create:
5043          new_temp = loop_exit_def + adjustment_def  */
5044
5045   if (adjustment_def)
5046     {
5047       gcc_assert (!slp_reduc);
5048       if (nested_in_vect_loop)
5049         {
5050           new_phi = new_phis[0];
5051           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5052           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5053           new_dest = vect_create_destination_var (scalar_dest, vectype);
5054         }
5055       else
5056         {
5057           new_temp = scalar_results[0];
5058           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5059           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5060           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5061         }
5062
5063       epilog_stmt = gimple_build_assign (new_dest, expr);
5064       new_temp = make_ssa_name (new_dest, epilog_stmt);
5065       gimple_assign_set_lhs (epilog_stmt, new_temp);
5066       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5067       if (nested_in_vect_loop)
5068         {
5069           stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5070           STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5071             = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5072
5073           if (!double_reduc)
5074             scalar_results.quick_push (new_temp);
5075           else
5076             scalar_results[0] = new_temp;
5077         }
5078       else
5079         scalar_results[0] = new_temp;
5080
5081       new_phis[0] = epilog_stmt;
5082     }
5083
5084   if (double_reduc)
5085     loop = loop->inner;
5086
5087   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5088           phis with new adjusted scalar results, i.e., replace use <s_out0>
5089           with use <s_out4>.
5090
5091      Transform:
5092         loop_exit:
5093           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5094           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5095           v_out2 = reduce <v_out1>
5096           s_out3 = extract_field <v_out2, 0>
5097           s_out4 = adjust_result <s_out3>
5098           use <s_out0>
5099           use <s_out0>
5100
5101      into:
5102
5103         loop_exit:
5104           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5105           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5106           v_out2 = reduce <v_out1>
5107           s_out3 = extract_field <v_out2, 0>
5108           s_out4 = adjust_result <s_out3>
5109           use <s_out4>
5110           use <s_out4> */
5111
5112
5113   /* In SLP reduction chain we reduce vector results into one vector if
5114      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5115      LHS of the last stmt in the reduction chain, since we are looking for
5116      the loop exit phi node.  */
5117   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5118     {
5119       stmt_vec_info dest_stmt_info
5120         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5121       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5122       group_size = 1;
5123     }
5124
5125   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5126      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5127      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5128      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5129      correspond to the first vector stmt, etc.
5130      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5131   if (group_size > new_phis.length ())
5132     gcc_assert (!(group_size % new_phis.length ()));
5133
5134   for (k = 0; k < group_size; k++)
5135     {
5136       if (slp_reduc)
5137         {
5138           stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5139
5140           orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5141           /* SLP statements can't participate in patterns.  */
5142           gcc_assert (!orig_stmt_info);
5143           scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5144         }
5145
5146       if (nested_in_vect_loop)
5147         {
5148           if (double_reduc)
5149             loop = outer_loop;
5150           else
5151             gcc_unreachable ();
5152         }
5153
5154       phis.create (3);
5155       /* Find the loop-closed-use at the loop exit of the original scalar
5156          result.  (The reduction result is expected to have two immediate uses,
5157          one at the latch block, and one at the loop exit).  For double
5158          reductions we are looking for exit phis of the outer loop.  */
5159       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5160         {
5161           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5162             {
5163               if (!is_gimple_debug (USE_STMT (use_p)))
5164                 phis.safe_push (USE_STMT (use_p));
5165             }
5166           else
5167             {
5168               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5169                 {
5170                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5171
5172                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5173                     {
5174                       if (!flow_bb_inside_loop_p (loop,
5175                                              gimple_bb (USE_STMT (phi_use_p)))
5176                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5177                         phis.safe_push (USE_STMT (phi_use_p));
5178                     }
5179                 }
5180             }
5181         }
5182
5183       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5184         {
5185           /* Replace the uses:  */
5186           orig_name = PHI_RESULT (exit_phi);
5187           scalar_result = scalar_results[k];
5188           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5189             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5190               SET_USE (use_p, scalar_result);
5191         }
5192
5193       phis.release ();
5194     }
5195 }
5196
5197 /* Return a vector of type VECTYPE that is equal to the vector select
5198    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5199    before GSI.  */
5200
5201 static tree
5202 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5203                      tree vec, tree identity)
5204 {
5205   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5206   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5207                                           mask, vec, identity);
5208   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5209   return cond;
5210 }
5211
5212 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5213    order, starting with LHS.  Insert the extraction statements before GSI and
5214    associate the new scalar SSA names with variable SCALAR_DEST.
5215    Return the SSA name for the result.  */
5216
5217 static tree
5218 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5219                        tree_code code, tree lhs, tree vector_rhs)
5220 {
5221   tree vectype = TREE_TYPE (vector_rhs);
5222   tree scalar_type = TREE_TYPE (vectype);
5223   tree bitsize = TYPE_SIZE (scalar_type);
5224   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5225   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5226
5227   for (unsigned HOST_WIDE_INT bit_offset = 0;
5228        bit_offset < vec_size_in_bits;
5229        bit_offset += element_bitsize)
5230     {
5231       tree bitpos = bitsize_int (bit_offset);
5232       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5233                          bitsize, bitpos);
5234
5235       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5236       rhs = make_ssa_name (scalar_dest, stmt);
5237       gimple_assign_set_lhs (stmt, rhs);
5238       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5239
5240       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5241       tree new_name = make_ssa_name (scalar_dest, stmt);
5242       gimple_assign_set_lhs (stmt, new_name);
5243       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5244       lhs = new_name;
5245     }
5246   return lhs;
5247 }
5248
5249 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
5250    type of the vector input.  */
5251
5252 static internal_fn
5253 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5254 {
5255   internal_fn mask_reduc_fn;
5256
5257   switch (reduc_fn)
5258     {
5259     case IFN_FOLD_LEFT_PLUS:
5260       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5261       break;
5262
5263     default:
5264       return IFN_LAST;
5265     }
5266
5267   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5268                                       OPTIMIZE_FOR_SPEED))
5269     return mask_reduc_fn;
5270   return IFN_LAST;
5271 }
5272
5273 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5274    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5275    statement.  CODE is the operation performed by STMT_INFO and OPS are
5276    its scalar operands.  REDUC_INDEX is the index of the operand in
5277    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5278    implements in-order reduction, or IFN_LAST if we should open-code it.
5279    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5280    that should be used to control the operation in a fully-masked loop.  */
5281
5282 static bool
5283 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5284                                gimple_stmt_iterator *gsi,
5285                                stmt_vec_info *vec_stmt, slp_tree slp_node,
5286                                gimple *reduc_def_stmt,
5287                                tree_code code, internal_fn reduc_fn,
5288                                tree ops[3], tree vectype_in,
5289                                int reduc_index, vec_loop_masks *masks)
5290 {
5291   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5292   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5293   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5294   stmt_vec_info new_stmt_info = NULL;
5295   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5296
5297   int ncopies;
5298   if (slp_node)
5299     ncopies = 1;
5300   else
5301     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5302
5303   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5304   gcc_assert (ncopies == 1);
5305   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5306
5307   if (slp_node)
5308     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5309                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5310
5311   tree op0 = ops[1 - reduc_index];
5312
5313   int group_size = 1;
5314   stmt_vec_info scalar_dest_def_info;
5315   auto_vec<tree> vec_oprnds0;
5316   if (slp_node)
5317     {
5318       auto_vec<vec<tree> > vec_defs (2);
5319       vect_get_slp_defs (slp_node, &vec_defs);
5320       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5321       vec_defs[0].release ();
5322       vec_defs[1].release ();
5323       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5324       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5325     }
5326   else
5327     {
5328       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5329       vec_oprnds0.create (1);
5330       vec_oprnds0.quick_push (loop_vec_def0);
5331       scalar_dest_def_info = stmt_info;
5332     }
5333
5334   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5335   tree scalar_type = TREE_TYPE (scalar_dest);
5336   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5337
5338   int vec_num = vec_oprnds0.length ();
5339   gcc_assert (vec_num == 1 || slp_node);
5340   tree vec_elem_type = TREE_TYPE (vectype_out);
5341   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5342
5343   tree vector_identity = NULL_TREE;
5344   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5345     vector_identity = build_zero_cst (vectype_out);
5346
5347   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5348   int i;
5349   tree def0;
5350   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5351     {
5352       gimple *new_stmt;
5353       tree mask = NULL_TREE;
5354       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5355         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5356
5357       /* Handle MINUS by adding the negative.  */
5358       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5359         {
5360           tree negated = make_ssa_name (vectype_out);
5361           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5362           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5363           def0 = negated;
5364         }
5365
5366       if (mask && mask_reduc_fn == IFN_LAST)
5367         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5368                                     vector_identity);
5369
5370       /* On the first iteration the input is simply the scalar phi
5371          result, and for subsequent iterations it is the output of
5372          the preceding operation.  */
5373       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5374         {
5375           if (mask && mask_reduc_fn != IFN_LAST)
5376             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5377                                                    def0, mask);
5378           else
5379             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5380                                                    def0);
5381           /* For chained SLP reductions the output of the previous reduction
5382              operation serves as the input of the next. For the final statement
5383              the output cannot be a temporary - we reuse the original
5384              scalar destination of the last statement.  */
5385           if (i != vec_num - 1)
5386             {
5387               gimple_set_lhs (new_stmt, scalar_dest_var);
5388               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5389               gimple_set_lhs (new_stmt, reduc_var);
5390             }
5391         }
5392       else
5393         {
5394           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5395                                              reduc_var, def0);
5396           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5397           /* Remove the statement, so that we can use the same code paths
5398              as for statements that we've just created.  */
5399           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5400           gsi_remove (&tmp_gsi, true);
5401         }
5402
5403       if (i == vec_num - 1)
5404         {
5405           gimple_set_lhs (new_stmt, scalar_dest);
5406           new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5407                                                     new_stmt);
5408         }
5409       else
5410         new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5411                                                      new_stmt, gsi);
5412
5413       if (slp_node)
5414         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5415     }
5416
5417   if (!slp_node)
5418     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5419
5420   return true;
5421 }
5422
5423 /* Function is_nonwrapping_integer_induction.
5424
5425    Check if STMT_VINO (which is part of loop LOOP) both increments and
5426    does not cause overflow.  */
5427
5428 static bool
5429 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5430 {
5431   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5432   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5433   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5434   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5435   widest_int ni, max_loop_value, lhs_max;
5436   wi::overflow_type overflow = wi::OVF_NONE;
5437
5438   /* Make sure the loop is integer based.  */
5439   if (TREE_CODE (base) != INTEGER_CST
5440       || TREE_CODE (step) != INTEGER_CST)
5441     return false;
5442
5443   /* Check that the max size of the loop will not wrap.  */
5444
5445   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5446     return true;
5447
5448   if (! max_stmt_executions (loop, &ni))
5449     return false;
5450
5451   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5452                             &overflow);
5453   if (overflow)
5454     return false;
5455
5456   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5457                             TYPE_SIGN (lhs_type), &overflow);
5458   if (overflow)
5459     return false;
5460
5461   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5462           <= TYPE_PRECISION (lhs_type));
5463 }
5464
5465 /* Check if masking can be supported by inserting a conditional expression.
5466    CODE is the code for the operation.  COND_FN is the conditional internal
5467    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
5468 static bool
5469 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5470                          tree vectype_in)
5471 {
5472   if (cond_fn != IFN_LAST
5473       && direct_internal_fn_supported_p (cond_fn, vectype_in,
5474                                          OPTIMIZE_FOR_SPEED))
5475     return false;
5476
5477   switch (code)
5478     {
5479     case DOT_PROD_EXPR:
5480     case SAD_EXPR:
5481       return true;
5482
5483     default:
5484       return false;
5485     }
5486 }
5487
5488 /* Insert a conditional expression to enable masked vectorization.  CODE is the
5489    code for the operation.  VOP is the array of operands.  MASK is the loop
5490    mask.  GSI is a statement iterator used to place the new conditional
5491    expression.  */
5492 static void
5493 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5494                       gimple_stmt_iterator *gsi)
5495 {
5496   switch (code)
5497     {
5498     case DOT_PROD_EXPR:
5499       {
5500         tree vectype = TREE_TYPE (vop[1]);
5501         tree zero = build_zero_cst (vectype);
5502         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5503         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5504                                                mask, vop[1], zero);
5505         gsi_insert_before (gsi, select, GSI_SAME_STMT);
5506         vop[1] = masked_op1;
5507         break;
5508       }
5509
5510     case SAD_EXPR:
5511       {
5512         tree vectype = TREE_TYPE (vop[1]);
5513         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5514         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5515                                                mask, vop[1], vop[0]);
5516         gsi_insert_before (gsi, select, GSI_SAME_STMT);
5517         vop[1] = masked_op1;
5518         break;
5519       }
5520
5521     default:
5522       gcc_unreachable ();
5523     }
5524 }
5525
5526 /* Function vectorizable_reduction.
5527
5528    Check if STMT_INFO performs a reduction operation that can be vectorized.
5529    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5530    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5531    Return true if STMT_INFO is vectorizable in this way.
5532
5533    This function also handles reduction idioms (patterns) that have been
5534    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
5535    may be of this form:
5536      X = pattern_expr (arg0, arg1, ..., X)
5537    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5538    sequence that had been detected and replaced by the pattern-stmt
5539    (STMT_INFO).
5540
5541    This function also handles reduction of condition expressions, for example:
5542      for (int i = 0; i < N; i++)
5543        if (a[i] < value)
5544          last = a[i];
5545    This is handled by vectorising the loop and creating an additional vector
5546    containing the loop indexes for which "a[i] < value" was true.  In the
5547    function epilogue this is reduced to a single max value and then used to
5548    index into the vector of results.
5549
5550    In some cases of reduction patterns, the type of the reduction variable X is
5551    different than the type of the other arguments of STMT_INFO.
5552    In such cases, the vectype that is used when transforming STMT_INFO into
5553    a vector stmt is different than the vectype that is used to determine the
5554    vectorization factor, because it consists of a different number of elements
5555    than the actual number of elements that are being operated upon in parallel.
5556
5557    For example, consider an accumulation of shorts into an int accumulator.
5558    On some targets it's possible to vectorize this pattern operating on 8
5559    shorts at a time (hence, the vectype for purposes of determining the
5560    vectorization factor should be V8HI); on the other hand, the vectype that
5561    is used to create the vector form is actually V4SI (the type of the result).
5562
5563    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5564    indicates what is the actual level of parallelism (V8HI in the example), so
5565    that the right vectorization factor would be derived.  This vectype
5566    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5567    be used to create the vectorized stmt.  The right vectype for the vectorized
5568    stmt is obtained from the type of the result X:
5569       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5570
5571    This means that, contrary to "regular" reductions (or "regular" stmts in
5572    general), the following equation:
5573       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5574    does *NOT* necessarily hold for reduction patterns.  */
5575
5576 bool
5577 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5578                         slp_instance slp_node_instance,
5579                         stmt_vector_for_cost *cost_vec)
5580 {
5581   tree scalar_dest;
5582   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5583   tree vectype_in = NULL_TREE;
5584   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5585   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5586   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5587   stmt_vec_info cond_stmt_vinfo = NULL;
5588   tree scalar_type;
5589   int i;
5590   int ncopies;
5591   bool single_defuse_cycle = false;
5592   bool nested_cycle = false;
5593   bool double_reduc = false;
5594   int vec_num;
5595   tree tem;
5596   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5597   tree cond_reduc_val = NULL_TREE;
5598
5599   /* Make sure it was already recognized as a reduction computation.  */
5600   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5601       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5602       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5603     return false;
5604
5605   /* The stmt we store reduction analysis meta on.  */
5606   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5607   reduc_info->is_reduc_info = true;
5608
5609   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5610     {
5611       if (is_a <gphi *> (stmt_info->stmt))
5612         /* Analysis for double-reduction is done on the outer
5613            loop PHI, nested cycles have no further restrictions.  */
5614         STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5615       else
5616         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5617       return true;
5618     }
5619
5620   stmt_vec_info orig_stmt_of_analysis = stmt_info;
5621   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5622       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5623     {
5624       if (!is_a <gphi *> (stmt_info->stmt))
5625         {
5626           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5627           return true;
5628         }
5629       if (slp_node)
5630         {
5631           slp_node_instance->reduc_phis = slp_node;
5632           /* ???  We're leaving slp_node to point to the PHIs, we only
5633              need it to get at the number of vector stmts which wasn't
5634              yet initialized for the instance root.  */
5635         }
5636       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5637         stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5638       else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5639         {
5640           use_operand_p use_p;
5641           gimple *use_stmt;
5642           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5643                                      &use_p, &use_stmt);
5644           gcc_assert (res);
5645           stmt_info = loop_vinfo->lookup_stmt (use_stmt);
5646           stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5647         }
5648       /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
5649          element.  */
5650       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5651         {
5652           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
5653           stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5654         }
5655     }
5656
5657   if (nested_in_vect_loop_p (loop, stmt_info))
5658     {
5659       loop = loop->inner;
5660       nested_cycle = true;
5661     }
5662
5663   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5664     gcc_assert (slp_node
5665                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5666
5667   /* 1. Is vectorizable reduction?  */
5668   /* Not supportable if the reduction variable is used in the loop, unless
5669      it's a reduction chain.  */
5670   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5671       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5672     return false;
5673
5674   /* Reductions that are not used even in an enclosing outer-loop,
5675      are expected to be "live" (used out of the loop).  */
5676   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5677       && !STMT_VINFO_LIVE_P (stmt_info))
5678     return false;
5679
5680   /* 2. Has this been recognized as a reduction pattern?
5681
5682      Check if STMT represents a pattern that has been recognized
5683      in earlier analysis stages.  For stmts that represent a pattern,
5684      the STMT_VINFO_RELATED_STMT field records the last stmt in
5685      the original sequence that constitutes the pattern.  */
5686
5687   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5688   if (orig_stmt_info)
5689     {
5690       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5691       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5692     }
5693
5694   /* 3. Check the operands of the operation.  The first operands are defined
5695         inside the loop body. The last operand is the reduction variable,
5696         which is defined by the loop-header-phi.  */
5697
5698   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5699   enum tree_code code = gimple_assign_rhs_code (stmt);
5700   bool lane_reduc_code_p
5701     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
5702   int op_type = TREE_CODE_LENGTH (code);
5703
5704   scalar_dest = gimple_assign_lhs (stmt);
5705   scalar_type = TREE_TYPE (scalar_dest);
5706   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5707       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5708     return false;
5709
5710   /* Do not try to vectorize bit-precision reductions.  */
5711   if (!type_has_mode_precision_p (scalar_type))
5712     return false;
5713
5714   /* All uses but the last are expected to be defined in the loop.
5715      The last use is the reduction variable.  In case of nested cycle this
5716      assumption is not true: we use reduc_index to record the index of the
5717      reduction variable.  */
5718   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
5719   /* PHIs should not participate in patterns.  */
5720   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
5721   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
5722
5723   /* Verify following REDUC_IDX from the latch def leads us back to the PHI.  */
5724   tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
5725                                           loop_latch_edge (loop));
5726   while (reduc_def != PHI_RESULT (reduc_def_phi))
5727     {
5728       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
5729       def = vect_stmt_to_vectorize (def);
5730       gcc_assert (STMT_VINFO_REDUC_IDX (def) != -1);
5731       reduc_def = gimple_op (def->stmt, 1 + STMT_VINFO_REDUC_IDX (def));
5732     }
5733
5734   reduc_def = PHI_RESULT (reduc_def_phi);
5735   int reduc_index = -1;
5736   for (i = 0; i < op_type; i++)
5737     {
5738       tree op = gimple_op (stmt, i + 1);
5739       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5740       if (i == 0 && code == COND_EXPR)
5741         continue;
5742
5743       stmt_vec_info def_stmt_info;
5744       enum vect_def_type dt;
5745       if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
5746                                &def_stmt_info))
5747         {
5748           if (dump_enabled_p ())
5749             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5750                              "use not simple.\n");
5751           return false;
5752         }
5753       if ((dt == vect_reduction_def || dt == vect_nested_cycle)
5754           && op == reduc_def)
5755         {
5756           reduc_index = i;
5757           continue;
5758         }
5759
5760       /* There should be only one cycle def in the stmt, the one
5761          leading to reduc_def.  */
5762       if (VECTORIZABLE_CYCLE_DEF (dt))
5763         return false;
5764
5765       /* To properly compute ncopies we are interested in the widest
5766          non-reduction input type in case we're looking at a widening
5767          accumulation that we later handle in vect_transform_reduction.  */
5768       if (lane_reduc_code_p
5769           && tem
5770           && (!vectype_in
5771               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5772                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
5773         vectype_in = tem;
5774
5775       if (code == COND_EXPR)
5776         {
5777           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
5778           if (dt == vect_constant_def)
5779             {
5780               cond_reduc_dt = dt;
5781               cond_reduc_val = op;
5782             }
5783           if (dt == vect_induction_def
5784               && def_stmt_info
5785               && is_nonwrapping_integer_induction (def_stmt_info, loop))
5786             {
5787               cond_reduc_dt = dt;
5788               cond_stmt_vinfo = def_stmt_info;
5789             }
5790         }
5791     }
5792   if (!vectype_in)
5793     vectype_in = vectype_out;
5794   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
5795   /* For the SSA cycle we store on each participating stmt the operand index
5796      where the cycle continues.  Store the one relevant for the actual
5797      operation in the reduction meta.  */
5798   STMT_VINFO_REDUC_IDX (reduc_info) = reduc_index;
5799
5800   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
5801   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
5802   /* If we have a condition reduction, see if we can simplify it further.  */
5803   if (v_reduc_type == COND_REDUCTION)
5804     {
5805       if (slp_node)
5806         return false;
5807
5808       /* TODO: We can't yet handle reduction chains, since we need to treat
5809          each COND_EXPR in the chain specially, not just the last one.
5810          E.g. for:
5811
5812             x_1 = PHI <x_3, ...>
5813             x_2 = a_2 ? ... : x_1;
5814             x_3 = a_3 ? ... : x_2;
5815
5816          we're interested in the last element in x_3 for which a_2 || a_3
5817          is true, whereas the current reduction chain handling would
5818          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
5819          as a reduction operation.  */
5820       if (reduc_index == -1)
5821         {
5822           if (dump_enabled_p ())
5823             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5824                              "conditional reduction chains not supported\n");
5825           return false;
5826         }
5827
5828       /* When the condition uses the reduction value in the condition, fail.  */
5829       if (reduc_index == 0)
5830         {
5831           if (dump_enabled_p ())
5832             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5833                              "condition depends on previous iteration\n");
5834           return false;
5835         }
5836
5837       if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
5838                                           vectype_in, OPTIMIZE_FOR_SPEED))
5839         {
5840           if (dump_enabled_p ())
5841             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5842                              "optimizing condition reduction with"
5843                              " FOLD_EXTRACT_LAST.\n");
5844           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
5845         }
5846       else if (cond_reduc_dt == vect_induction_def)
5847         {
5848           tree base
5849             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5850           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5851
5852           gcc_assert (TREE_CODE (base) == INTEGER_CST
5853                       && TREE_CODE (step) == INTEGER_CST);
5854           cond_reduc_val = NULL_TREE;
5855           enum tree_code cond_reduc_op_code = ERROR_MARK;
5856           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
5857           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
5858             ;
5859           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
5860              above base; punt if base is the minimum value of the type for
5861              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
5862           else if (tree_int_cst_sgn (step) == -1)
5863             {
5864               cond_reduc_op_code = MIN_EXPR;
5865               if (tree_int_cst_sgn (base) == -1)
5866                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5867               else if (tree_int_cst_lt (base,
5868                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
5869                 cond_reduc_val
5870                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
5871             }
5872           else
5873             {
5874               cond_reduc_op_code = MAX_EXPR;
5875               if (tree_int_cst_sgn (base) == 1)
5876                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5877               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5878                                         base))
5879                 cond_reduc_val
5880                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
5881             }
5882           if (cond_reduc_val)
5883             {
5884               if (dump_enabled_p ())
5885                 dump_printf_loc (MSG_NOTE, vect_location,
5886                                  "condition expression based on "
5887                                  "integer induction.\n");
5888               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
5889               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
5890                 = cond_reduc_val;
5891               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
5892             }
5893         }
5894       else if (cond_reduc_dt == vect_constant_def)
5895         {
5896           enum vect_def_type cond_initial_dt;
5897           tree cond_initial_val
5898             = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
5899
5900           gcc_assert (cond_reduc_val != NULL_TREE);
5901           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
5902           if (cond_initial_dt == vect_constant_def
5903               && types_compatible_p (TREE_TYPE (cond_initial_val),
5904                                      TREE_TYPE (cond_reduc_val)))
5905             {
5906               tree e = fold_binary (LE_EXPR, boolean_type_node,
5907                                     cond_initial_val, cond_reduc_val);
5908               if (e && (integer_onep (e) || integer_zerop (e)))
5909                 {
5910                   if (dump_enabled_p ())
5911                     dump_printf_loc (MSG_NOTE, vect_location,
5912                                      "condition expression based on "
5913                                      "compile time constant.\n");
5914                   /* Record reduction code at analysis stage.  */
5915                   STMT_VINFO_REDUC_CODE (reduc_info)
5916                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5917                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
5918                 }
5919             }
5920         }
5921     }
5922
5923   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5924     /* We changed STMT to be the first stmt in reduction chain, hence we
5925        check that in this case the first element in the chain is STMT.  */
5926     gcc_assert (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (phi_info))
5927                 == vect_orig_stmt (stmt_info));
5928
5929   if (STMT_VINFO_LIVE_P (phi_info))
5930     return false;
5931
5932   if (slp_node)
5933     ncopies = 1;
5934   else
5935     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5936
5937   gcc_assert (ncopies >= 1);
5938
5939   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
5940
5941   if (nested_cycle)
5942     {
5943       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
5944                   == vect_double_reduction_def);
5945       double_reduc = true;
5946     }
5947
5948   /* 4.2. Check support for the epilog operation.
5949
5950           If STMT represents a reduction pattern, then the type of the
5951           reduction variable may be different than the type of the rest
5952           of the arguments.  For example, consider the case of accumulation
5953           of shorts into an int accumulator; The original code:
5954                         S1: int_a = (int) short_a;
5955           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5956
5957           was replaced with:
5958                         STMT: int_acc = widen_sum <short_a, int_acc>
5959
5960           This means that:
5961           1. The tree-code that is used to create the vector operation in the
5962              epilog code (that reduces the partial results) is not the
5963              tree-code of STMT, but is rather the tree-code of the original
5964              stmt from the pattern that STMT is replacing.  I.e, in the example
5965              above we want to use 'widen_sum' in the loop, but 'plus' in the
5966              epilog.
5967           2. The type (mode) we use to check available target support
5968              for the vector operation to be created in the *epilog*, is
5969              determined by the type of the reduction variable (in the example
5970              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5971              However the type (mode) we use to check available target support
5972              for the vector operation to be created *inside the loop*, is
5973              determined by the type of the other arguments to STMT (in the
5974              example we'd check this: optab_handler (widen_sum_optab,
5975              vect_short_mode)).
5976
5977           This is contrary to "regular" reductions, in which the types of all
5978           the arguments are the same as the type of the reduction variable.
5979           For "regular" reductions we can therefore use the same vector type
5980           (and also the same tree-code) when generating the epilog code and
5981           when generating the code inside the loop.  */
5982
5983   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
5984   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
5985
5986   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
5987   if (reduction_type == TREE_CODE_REDUCTION)
5988     {
5989       /* Check whether it's ok to change the order of the computation.
5990          Generally, when vectorizing a reduction we change the order of the
5991          computation.  This may change the behavior of the program in some
5992          cases, so we need to check that this is ok.  One exception is when
5993          vectorizing an outer-loop: the inner-loop is executed sequentially,
5994          and therefore vectorizing reductions in the inner-loop during
5995          outer-loop vectorization is safe.  */
5996       if (needs_fold_left_reduction_p (scalar_type, orig_code))
5997         {
5998           STMT_VINFO_REDUC_TYPE (reduc_info)
5999             = reduction_type = FOLD_LEFT_REDUCTION;
6000           /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6001              directy used in stmt.  */
6002           if (reduc_index == -1)
6003             {
6004               if (dump_enabled_p ())
6005                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6006                                  "in-order reduction chain without SLP.\n");
6007               return false;
6008             }
6009         }
6010       else if (!commutative_tree_code (orig_code)
6011                || !associative_tree_code (orig_code))
6012         {
6013           if (dump_enabled_p ())
6014             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6015                             "reduction: not commutative/associative");
6016           return false;
6017         }
6018     }
6019
6020   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6021       && ncopies > 1)
6022     {
6023       if (dump_enabled_p ())
6024         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6025                          "multiple types in double reduction or condition "
6026                          "reduction or fold-left reduction.\n");
6027       return false;
6028     }
6029
6030   internal_fn reduc_fn = IFN_LAST;
6031   if (reduction_type == TREE_CODE_REDUCTION
6032       || reduction_type == FOLD_LEFT_REDUCTION
6033       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6034       || reduction_type == CONST_COND_REDUCTION)
6035     {
6036       if (reduction_type == FOLD_LEFT_REDUCTION
6037           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6038           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6039         {
6040           if (reduc_fn != IFN_LAST
6041               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6042                                                   OPTIMIZE_FOR_SPEED))
6043             {
6044               if (dump_enabled_p ())
6045                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6046                                  "reduc op not supported by target.\n");
6047
6048               reduc_fn = IFN_LAST;
6049             }
6050         }
6051       else
6052         {
6053           if (!nested_cycle || double_reduc)
6054             {
6055               if (dump_enabled_p ())
6056                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6057                                  "no reduc code for scalar code.\n");
6058
6059               return false;
6060             }
6061         }
6062     }
6063   else if (reduction_type == COND_REDUCTION)
6064     {
6065       int scalar_precision
6066         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6067       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6068       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6069                                                 nunits_out);
6070
6071       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6072                                           OPTIMIZE_FOR_SPEED))
6073         reduc_fn = IFN_REDUC_MAX;
6074     }
6075   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6076
6077   if (reduction_type != EXTRACT_LAST_REDUCTION
6078       && (!nested_cycle || double_reduc)
6079       && reduc_fn == IFN_LAST
6080       && !nunits_out.is_constant ())
6081     {
6082       if (dump_enabled_p ())
6083         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6084                          "missing target support for reduction on"
6085                          " variable-length vectors.\n");
6086       return false;
6087     }
6088
6089   /* For SLP reductions, see if there is a neutral value we can use.  */
6090   tree neutral_op = NULL_TREE;
6091   if (slp_node)
6092     neutral_op = neutral_op_for_slp_reduction
6093       (slp_node_instance->reduc_phis, orig_code,
6094        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6095
6096   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6097     {
6098       /* We can't support in-order reductions of code such as this:
6099
6100            for (int i = 0; i < n1; ++i)
6101              for (int j = 0; j < n2; ++j)
6102                l += a[j];
6103
6104          since GCC effectively transforms the loop when vectorizing:
6105
6106            for (int i = 0; i < n1 / VF; ++i)
6107              for (int j = 0; j < n2; ++j)
6108                for (int k = 0; k < VF; ++k)
6109                  l += a[j];
6110
6111          which is a reassociation of the original operation.  */
6112       if (dump_enabled_p ())
6113         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6114                          "in-order double reduction not supported.\n");
6115
6116       return false;
6117     }
6118
6119   if (reduction_type == FOLD_LEFT_REDUCTION
6120       && slp_node
6121       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6122     {
6123       /* We cannot use in-order reductions in this case because there is
6124          an implicit reassociation of the operations involved.  */
6125       if (dump_enabled_p ())
6126         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127                          "in-order unchained SLP reductions not supported.\n");
6128       return false;
6129     }
6130
6131   /* For double reductions, and for SLP reductions with a neutral value,
6132      we construct a variable-length initial vector by loading a vector
6133      full of the neutral value and then shift-and-inserting the start
6134      values into the low-numbered elements.  */
6135   if ((double_reduc || neutral_op)
6136       && !nunits_out.is_constant ()
6137       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6138                                           vectype_out, OPTIMIZE_FOR_SPEED))
6139     {
6140       if (dump_enabled_p ())
6141         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6142                          "reduction on variable-length vectors requires"
6143                          " target support for a vector-shift-and-insert"
6144                          " operation.\n");
6145       return false;
6146     }
6147
6148   /* Check extra constraints for variable-length unchained SLP reductions.  */
6149   if (STMT_SLP_TYPE (stmt_info)
6150       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6151       && !nunits_out.is_constant ())
6152     {
6153       /* We checked above that we could build the initial vector when
6154          there's a neutral element value.  Check here for the case in
6155          which each SLP statement has its own initial value and in which
6156          that value needs to be repeated for every instance of the
6157          statement within the initial vector.  */
6158       unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6159       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6160       if (!neutral_op
6161           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6162                                               elt_mode))
6163         {
6164           if (dump_enabled_p ())
6165             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166                              "unsupported form of SLP reduction for"
6167                              " variable-length vectors: cannot build"
6168                              " initial vector.\n");
6169           return false;
6170         }
6171       /* The epilogue code relies on the number of elements being a multiple
6172          of the group size.  The duplicate-and-interleave approach to setting
6173          up the the initial vector does too.  */
6174       if (!multiple_p (nunits_out, group_size))
6175         {
6176           if (dump_enabled_p ())
6177             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6178                              "unsupported form of SLP reduction for"
6179                              " variable-length vectors: the vector size"
6180                              " is not a multiple of the number of results.\n");
6181           return false;
6182         }
6183     }
6184
6185   if (reduction_type == COND_REDUCTION)
6186     {
6187       widest_int ni;
6188
6189       if (! max_loop_iterations (loop, &ni))
6190         {
6191           if (dump_enabled_p ())
6192             dump_printf_loc (MSG_NOTE, vect_location,
6193                              "loop count not known, cannot create cond "
6194                              "reduction.\n");
6195           return false;
6196         }
6197       /* Convert backedges to iterations.  */
6198       ni += 1;
6199
6200       /* The additional index will be the same type as the condition.  Check
6201          that the loop can fit into this less one (because we'll use up the
6202          zero slot for when there are no matches).  */
6203       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6204       if (wi::geu_p (ni, wi::to_widest (max_index)))
6205         {
6206           if (dump_enabled_p ())
6207             dump_printf_loc (MSG_NOTE, vect_location,
6208                              "loop size is greater than data size.\n");
6209           return false;
6210         }
6211     }
6212
6213   /* In case the vectorization factor (VF) is bigger than the number
6214      of elements that we can fit in a vectype (nunits), we have to generate
6215      more than one vector stmt - i.e - we need to "unroll" the
6216      vector stmt by a factor VF/nunits.  For more details see documentation
6217      in vectorizable_operation.  */
6218
6219   /* If the reduction is used in an outer loop we need to generate
6220      VF intermediate results, like so (e.g. for ncopies=2):
6221         r0 = phi (init, r0)
6222         r1 = phi (init, r1)
6223         r0 = x0 + r0;
6224         r1 = x1 + r1;
6225     (i.e. we generate VF results in 2 registers).
6226     In this case we have a separate def-use cycle for each copy, and therefore
6227     for each copy we get the vector def for the reduction variable from the
6228     respective phi node created for this copy.
6229
6230     Otherwise (the reduction is unused in the loop nest), we can combine
6231     together intermediate results, like so (e.g. for ncopies=2):
6232         r = phi (init, r)
6233         r = x0 + r;
6234         r = x1 + r;
6235    (i.e. we generate VF/2 results in a single register).
6236    In this case for each copy we get the vector def for the reduction variable
6237    from the vectorized reduction operation generated in the previous iteration.
6238
6239    This only works when we see both the reduction PHI and its only consumer
6240    in vectorizable_reduction and there are no intermediate stmts
6241    participating.  */
6242   stmt_vec_info use_stmt_info;
6243   tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6244   if (ncopies > 1
6245       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6246       && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6247       && (!STMT_VINFO_IN_PATTERN_P (use_stmt_info)
6248           || !STMT_VINFO_PATTERN_DEF_SEQ (use_stmt_info))
6249       && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6250     single_defuse_cycle = true;
6251
6252   if (single_defuse_cycle || lane_reduc_code_p)
6253     {
6254       gcc_assert (code != COND_EXPR);
6255
6256       /* 4. Supportable by target?  */
6257       bool ok = true;
6258
6259       /* 4.1. check support for the operation in the loop  */
6260       optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6261       if (!optab)
6262         {
6263           if (dump_enabled_p ())
6264             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265                              "no optab.\n");
6266           ok = false;
6267         }
6268
6269       machine_mode vec_mode = TYPE_MODE (vectype_in);
6270       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6271         {
6272           if (dump_enabled_p ())
6273             dump_printf (MSG_NOTE, "op not supported by target.\n");
6274           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6275               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6276             ok = false;
6277           else
6278             if (dump_enabled_p ())
6279               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6280         }
6281
6282       /* Worthwhile without SIMD support?  */
6283       if (ok
6284           && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6285           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6286         {
6287           if (dump_enabled_p ())
6288             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289                              "not worthwhile without SIMD support.\n");
6290           ok = false;
6291         }
6292
6293       /* lane-reducing operations have to go through vect_transform_reduction.
6294          For the other cases try without the single cycle optimization.  */
6295       if (!ok)
6296         {
6297           if (lane_reduc_code_p)
6298             return false;
6299           else
6300             single_defuse_cycle = false;
6301         }
6302     }
6303   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6304
6305   /* If the reduction stmt is one of the patterns that have lane
6306      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6307   if ((ncopies > 1 && ! single_defuse_cycle)
6308       && lane_reduc_code_p)
6309     {
6310       if (dump_enabled_p ())
6311         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6312                          "multi def-use cycle not possible for lane-reducing "
6313                          "reduction operation\n");
6314       return false;
6315     }
6316
6317   if (slp_node)
6318     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6319   else
6320     vec_num = 1;
6321
6322   vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6323                              cost_vec);
6324   if (dump_enabled_p ()
6325       && reduction_type == FOLD_LEFT_REDUCTION)
6326     dump_printf_loc (MSG_NOTE, vect_location,
6327                      "using an in-order (fold-left) reduction.\n");
6328   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6329   /* All but single defuse-cycle optimized, lane-reducing and fold-left
6330      reductions go through their own vectorizable_* routines.  */
6331   if (!single_defuse_cycle
6332       && code != DOT_PROD_EXPR
6333       && code != WIDEN_SUM_EXPR
6334       && code != SAD_EXPR
6335       && reduction_type != FOLD_LEFT_REDUCTION)
6336     {
6337       STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def;
6338       STMT_VINFO_DEF_TYPE (vect_orig_stmt (stmt_info)) = vect_internal_def;
6339     }
6340   else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6341     {
6342       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6343       internal_fn cond_fn = get_conditional_internal_fn (code);
6344
6345       if (reduction_type != FOLD_LEFT_REDUCTION
6346           && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6347           && (cond_fn == IFN_LAST
6348               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6349                                                   OPTIMIZE_FOR_SPEED)))
6350         {
6351           if (dump_enabled_p ())
6352             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6353                              "can't use a fully-masked loop because no"
6354                              " conditional operation is available.\n");
6355           LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6356         }
6357       else
6358         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6359                                vectype_in, NULL);
6360     }
6361   return true;
6362 }
6363
6364 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6365    value.  */
6366
6367 bool
6368 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6369                           stmt_vec_info *vec_stmt, slp_tree slp_node)
6370 {
6371   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6372   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6373   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6374   int i;
6375   int ncopies;
6376   int j;
6377   int vec_num;
6378
6379   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6380   gcc_assert (reduc_info->is_reduc_info);
6381
6382   if (nested_in_vect_loop_p (loop, stmt_info))
6383     {
6384       loop = loop->inner;
6385       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6386     }
6387
6388   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6389   enum tree_code code = gimple_assign_rhs_code (stmt);
6390   int op_type = TREE_CODE_LENGTH (code);
6391
6392   /* Flatten RHS.  */
6393   tree ops[3];
6394   switch (get_gimple_rhs_class (code))
6395     {
6396     case GIMPLE_TERNARY_RHS:
6397       ops[2] = gimple_assign_rhs3 (stmt);
6398       /* Fall thru.  */
6399     case GIMPLE_BINARY_RHS:
6400       ops[0] = gimple_assign_rhs1 (stmt);
6401       ops[1] = gimple_assign_rhs2 (stmt);
6402       break;
6403     default:
6404       gcc_unreachable ();
6405     }
6406
6407   /* All uses but the last are expected to be defined in the loop.
6408      The last use is the reduction variable.  In case of nested cycle this
6409      assumption is not true: we use reduc_index to record the index of the
6410      reduction variable.  */
6411   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6412   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6413   int reduc_index = STMT_VINFO_REDUC_IDX (reduc_info);
6414   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6415
6416   if (slp_node)
6417     {
6418       ncopies = 1;
6419       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6420     }
6421   else
6422     {
6423       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6424       vec_num = 1;
6425     }
6426
6427   internal_fn cond_fn = get_conditional_internal_fn (code);
6428   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6429   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6430
6431   /* Transform.  */
6432   stmt_vec_info new_stmt_info = NULL;
6433   stmt_vec_info prev_stmt_info;
6434   tree new_temp = NULL_TREE;
6435   auto_vec<tree> vec_oprnds0;
6436   auto_vec<tree> vec_oprnds1;
6437   auto_vec<tree> vec_oprnds2;
6438   tree def0;
6439
6440   if (dump_enabled_p ())
6441     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6442
6443   /* FORNOW: Multiple types are not supported for condition.  */
6444   if (code == COND_EXPR)
6445     gcc_assert (ncopies == 1);
6446
6447   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6448
6449   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6450   if (reduction_type == FOLD_LEFT_REDUCTION)
6451     {
6452       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6453       return vectorize_fold_left_reduction
6454           (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6455            reduc_fn, ops, vectype_in, reduc_index, masks);
6456     }
6457
6458   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6459   gcc_assert (single_defuse_cycle
6460               || code == DOT_PROD_EXPR
6461               || code == WIDEN_SUM_EXPR
6462               || code == SAD_EXPR);
6463
6464   /* Create the destination vector  */
6465   tree scalar_dest = gimple_assign_lhs (stmt);
6466   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6467
6468   prev_stmt_info = NULL;
6469   if (!slp_node)
6470     {
6471       vec_oprnds0.create (1);
6472       vec_oprnds1.create (1);
6473       if (op_type == ternary_op)
6474         vec_oprnds2.create (1);
6475     }
6476
6477   for (j = 0; j < ncopies; j++)
6478     {
6479       /* Handle uses.  */
6480       if (j == 0)
6481         {
6482           if (slp_node)
6483             {
6484               /* Get vec defs for all the operands except the reduction index,
6485                  ensuring the ordering of the ops in the vector is kept.  */
6486               auto_vec<vec<tree>, 3> vec_defs;
6487               vect_get_slp_defs (slp_node, &vec_defs);
6488               vec_oprnds0.safe_splice (vec_defs[0]);
6489               vec_defs[0].release ();
6490               vec_oprnds1.safe_splice (vec_defs[1]);
6491               vec_defs[1].release ();
6492               if (op_type == ternary_op)
6493                 {
6494                   vec_oprnds2.safe_splice (vec_defs[2]);
6495                   vec_defs[2].release ();
6496                 }
6497             }
6498           else
6499             {
6500               vec_oprnds0.quick_push
6501                 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6502               vec_oprnds1.quick_push
6503                 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6504               if (op_type == ternary_op)
6505                 vec_oprnds2.quick_push
6506                   (vect_get_vec_def_for_operand (ops[2], stmt_info));
6507             }
6508         }
6509       else
6510         {
6511           if (!slp_node)
6512             {
6513               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6514
6515               if (single_defuse_cycle && reduc_index == 0)
6516                 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6517               else
6518                 vec_oprnds0[0]
6519                   = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6520                                                     vec_oprnds0[0]);
6521               if (single_defuse_cycle && reduc_index == 1)
6522                 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6523               else
6524                 vec_oprnds1[0]
6525                   = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6526                                                     vec_oprnds1[0]);
6527               if (op_type == ternary_op)
6528                 {
6529                   if (single_defuse_cycle && reduc_index == 2)
6530                     vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6531                   else
6532                     vec_oprnds2[0]
6533                       = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6534                                                         vec_oprnds2[0]);
6535                 }
6536             }
6537         }
6538
6539       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6540         {
6541           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6542           if (masked_loop_p && !mask_by_cond_expr)
6543             {
6544               /* Make sure that the reduction accumulator is vop[0].  */
6545               if (reduc_index == 1)
6546                 {
6547                   gcc_assert (commutative_tree_code (code));
6548                   std::swap (vop[0], vop[1]);
6549                 }
6550               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6551                                               vectype_in, i * ncopies + j);
6552               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6553                                                         vop[0], vop[1],
6554                                                         vop[0]);
6555               new_temp = make_ssa_name (vec_dest, call);
6556               gimple_call_set_lhs (call, new_temp);
6557               gimple_call_set_nothrow (call, true);
6558               new_stmt_info
6559                 = vect_finish_stmt_generation (stmt_info, call, gsi);
6560             }
6561           else
6562             {
6563               if (op_type == ternary_op)
6564                 vop[2] = vec_oprnds2[i];
6565
6566               if (masked_loop_p && mask_by_cond_expr)
6567                 {
6568                   tree mask = vect_get_loop_mask (gsi, masks,
6569                                                   vec_num * ncopies,
6570                                                   vectype_in, i * ncopies + j);
6571                   build_vect_cond_expr (code, vop, mask, gsi);
6572                 }
6573
6574               gassign *new_stmt = gimple_build_assign (vec_dest, code,
6575                                                        vop[0], vop[1], vop[2]);
6576               new_temp = make_ssa_name (vec_dest, new_stmt);
6577               gimple_assign_set_lhs (new_stmt, new_temp);
6578               new_stmt_info
6579                 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6580             }
6581
6582           if (slp_node)
6583             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6584         }
6585
6586       if (slp_node || single_defuse_cycle)
6587         continue;
6588
6589       if (j == 0)
6590         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6591       else
6592         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6593
6594       prev_stmt_info = new_stmt_info;
6595     }
6596
6597   if (single_defuse_cycle && !slp_node)
6598     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6599
6600   return true;
6601 }
6602
6603 /* Transform phase of a cycle PHI.  */
6604
6605 bool
6606 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6607                           slp_tree slp_node, slp_instance slp_node_instance)
6608 {
6609   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6610   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6611   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6612   int i;
6613   int ncopies;
6614   stmt_vec_info prev_phi_info;
6615   int j;
6616   bool nested_cycle = false;
6617   int vec_num;
6618
6619   if (nested_in_vect_loop_p (loop, stmt_info))
6620     {
6621       loop = loop->inner;
6622       nested_cycle = true;
6623     }
6624
6625   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6626   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6627   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6628   gcc_assert (reduc_info->is_reduc_info);
6629
6630   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
6631       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
6632     /* Leave the scalar phi in place.  */
6633     return true;
6634
6635   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6636   /* For a nested cycle we do not fill the above.  */
6637   if (!vectype_in)
6638     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6639   gcc_assert (vectype_in);
6640
6641   if (slp_node)
6642     {
6643       /* The size vect_schedule_slp_instance computes is off for us.  */
6644       vec_num = vect_get_num_vectors
6645           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6646            * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
6647       ncopies = 1;
6648     }
6649   else
6650     {
6651       vec_num = 1;
6652       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6653     }
6654
6655   /* Check whether we should use a single PHI node and accumulate
6656      vectors to one before the backedge.  */
6657   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
6658     ncopies = 1;
6659
6660   /* Create the destination vector  */
6661   gphi *phi = as_a <gphi *> (stmt_info->stmt);
6662   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
6663                                                vectype_out);
6664
6665   /* Get the loop-entry arguments.  */
6666   tree vec_initial_def;
6667   auto_vec<tree> vec_initial_defs;
6668   if (slp_node)
6669     {
6670       vec_initial_defs.reserve (vec_num);
6671       gcc_assert (slp_node == slp_node_instance->reduc_phis);
6672       stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
6673       tree neutral_op
6674         = neutral_op_for_slp_reduction (slp_node,
6675                                         STMT_VINFO_REDUC_CODE (reduc_info),
6676                                         first != NULL);
6677       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
6678                                       &vec_initial_defs, vec_num,
6679                                       first != NULL, neutral_op);
6680     }
6681   else
6682     {
6683       /* Get at the scalar def before the loop, that defines the initial
6684          value of the reduction variable.  */
6685       tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
6686                                                 loop_preheader_edge (loop));
6687       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
6688          and we can't use zero for induc_val, use initial_def.  Similarly
6689          for REDUC_MIN and initial_def larger than the base.  */
6690       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6691         {
6692           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6693           if (TREE_CODE (initial_def) == INTEGER_CST
6694               && !integer_zerop (induc_val)
6695               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
6696                    && tree_int_cst_lt (initial_def, induc_val))
6697                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
6698                       && tree_int_cst_lt (induc_val, initial_def))))
6699             {
6700               induc_val = initial_def;
6701               /* Communicate we used the initial_def to epilouge
6702                  generation.  */
6703               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
6704             }
6705           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
6706         }
6707       else if (nested_cycle)
6708         {
6709           /* Do not use an adjustment def as that case is not supported
6710              correctly if ncopies is not one.  */
6711           vec_initial_def = vect_get_vec_def_for_operand (initial_def,
6712                                                           reduc_stmt_info);
6713         }
6714       else
6715         {
6716           tree adjustment_def = NULL_TREE;
6717           tree *adjustment_defp = &adjustment_def;
6718           enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
6719           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6720             adjustment_defp = NULL;
6721           vec_initial_def
6722             = get_initial_def_for_reduction (reduc_stmt_info, code,
6723                                              initial_def, adjustment_defp);
6724           STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
6725         }
6726       vec_initial_defs.create (1);
6727       vec_initial_defs.quick_push (vec_initial_def);
6728     }
6729
6730   /* Generate the reduction PHIs upfront.  */
6731   prev_phi_info = NULL;
6732   for (i = 0; i < vec_num; i++)
6733     {
6734       tree vec_init_def = vec_initial_defs[i];
6735       for (j = 0; j < ncopies; j++)
6736         {
6737           /* Create the reduction-phi that defines the reduction
6738              operand.  */
6739           gphi *new_phi = create_phi_node (vec_dest, loop->header);
6740           stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6741
6742           /* Set the loop-entry arg of the reduction-phi.  */
6743           if (j != 0 && nested_cycle)
6744             vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6745                                                            vec_init_def);
6746           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
6747                        UNKNOWN_LOCATION);
6748
6749           /* The loop-latch arg is set in epilogue processing.  */
6750
6751           if (slp_node)
6752             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6753           else
6754             {
6755               if (j == 0)
6756                 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6757               else
6758                 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6759               prev_phi_info = new_phi_info;
6760             }
6761         }
6762     }
6763
6764   return true;
6765 }
6766
6767 /* Vectorizes LC PHIs.  */
6768
6769 bool
6770 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6771                      slp_tree slp_node)
6772 {
6773   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6774   if (!loop_vinfo
6775       || !is_a <gphi *> (stmt_info->stmt)
6776       || gimple_phi_num_args (stmt_info->stmt) != 1)
6777     return false;
6778
6779   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6780       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
6781     return false;
6782
6783   if (!vec_stmt) /* transformation not required.  */
6784     {
6785       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
6786       return true;
6787     }
6788
6789   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6790   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
6791   basic_block bb = gimple_bb (stmt_info->stmt);
6792   edge e = single_pred_edge (bb);
6793   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
6794   vec<tree> vec_oprnds = vNULL;
6795   vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
6796                      stmt_info, &vec_oprnds, NULL, slp_node);
6797   if (slp_node)
6798     {
6799       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6800       gcc_assert (vec_oprnds.length () == vec_num);
6801       for (unsigned i = 0; i < vec_num; i++)
6802         {
6803           /* Create the vectorized LC PHI node.  */
6804           gphi *new_phi = create_phi_node (vec_dest, bb);
6805           add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
6806           stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6807           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6808         }
6809     }
6810   else
6811     {
6812       unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
6813       stmt_vec_info prev_phi_info = NULL;
6814       for (unsigned i = 0; i < ncopies; i++)
6815         {
6816           if (i != 0)
6817             vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
6818           /* Create the vectorized LC PHI node.  */
6819           gphi *new_phi = create_phi_node (vec_dest, bb);
6820           add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
6821           stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6822           if (i == 0)
6823             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6824           else
6825             STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6826           prev_phi_info = new_phi_info;
6827         }
6828     }
6829   vec_oprnds.release ();
6830
6831   return true;
6832 }
6833
6834
6835 /* Function vect_min_worthwhile_factor.
6836
6837    For a loop where we could vectorize the operation indicated by CODE,
6838    return the minimum vectorization factor that makes it worthwhile
6839    to use generic vectors.  */
6840 static unsigned int
6841 vect_min_worthwhile_factor (enum tree_code code)
6842 {
6843   switch (code)
6844     {
6845     case PLUS_EXPR:
6846     case MINUS_EXPR:
6847     case NEGATE_EXPR:
6848       return 4;
6849
6850     case BIT_AND_EXPR:
6851     case BIT_IOR_EXPR:
6852     case BIT_XOR_EXPR:
6853     case BIT_NOT_EXPR:
6854       return 2;
6855
6856     default:
6857       return INT_MAX;
6858     }
6859 }
6860
6861 /* Return true if VINFO indicates we are doing loop vectorization and if
6862    it is worth decomposing CODE operations into scalar operations for
6863    that loop's vectorization factor.  */
6864
6865 bool
6866 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6867 {
6868   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6869   unsigned HOST_WIDE_INT value;
6870   return (loop_vinfo
6871           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6872           && value >= vect_min_worthwhile_factor (code));
6873 }
6874
6875 /* Function vectorizable_induction
6876
6877    Check if STMT_INFO performs an induction computation that can be vectorized.
6878    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6879    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6880    Return true if STMT_INFO is vectorizable in this way.  */
6881
6882 bool
6883 vectorizable_induction (stmt_vec_info stmt_info,
6884                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6885                         stmt_vec_info *vec_stmt, slp_tree slp_node,
6886                         stmt_vector_for_cost *cost_vec)
6887 {
6888   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6889   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6890   unsigned ncopies;
6891   bool nested_in_vect_loop = false;
6892   class loop *iv_loop;
6893   tree vec_def;
6894   edge pe = loop_preheader_edge (loop);
6895   basic_block new_bb;
6896   tree new_vec, vec_init, vec_step, t;
6897   tree new_name;
6898   gimple *new_stmt;
6899   gphi *induction_phi;
6900   tree induc_def, vec_dest;
6901   tree init_expr, step_expr;
6902   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6903   unsigned i;
6904   tree expr;
6905   gimple_seq stmts;
6906   imm_use_iterator imm_iter;
6907   use_operand_p use_p;
6908   gimple *exit_phi;
6909   edge latch_e;
6910   tree loop_arg;
6911   gimple_stmt_iterator si;
6912
6913   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
6914   if (!phi)
6915     return false;
6916
6917   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6918     return false;
6919
6920   /* Make sure it was recognized as induction computation.  */
6921   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6922     return false;
6923
6924   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6925   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6926
6927   if (slp_node)
6928     ncopies = 1;
6929   else
6930     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6931   gcc_assert (ncopies >= 1);
6932
6933   /* FORNOW. These restrictions should be relaxed.  */
6934   if (nested_in_vect_loop_p (loop, stmt_info))
6935     {
6936       imm_use_iterator imm_iter;
6937       use_operand_p use_p;
6938       gimple *exit_phi;
6939       edge latch_e;
6940       tree loop_arg;
6941
6942       if (ncopies > 1)
6943         {
6944           if (dump_enabled_p ())
6945             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6946                              "multiple types in nested loop.\n");
6947           return false;
6948         }
6949
6950       /* FORNOW: outer loop induction with SLP not supported.  */
6951       if (STMT_SLP_TYPE (stmt_info))
6952         return false;
6953
6954       exit_phi = NULL;
6955       latch_e = loop_latch_edge (loop->inner);
6956       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6957       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6958         {
6959           gimple *use_stmt = USE_STMT (use_p);
6960           if (is_gimple_debug (use_stmt))
6961             continue;
6962
6963           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6964             {
6965               exit_phi = use_stmt;
6966               break;
6967             }
6968         }
6969       if (exit_phi)
6970         {
6971           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
6972           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6973                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6974             {
6975               if (dump_enabled_p ())
6976                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6977                                  "inner-loop induction only used outside "
6978                                  "of the outer vectorized loop.\n");
6979               return false;
6980             }
6981         }
6982
6983       nested_in_vect_loop = true;
6984       iv_loop = loop->inner;
6985     }
6986   else
6987     iv_loop = loop;
6988   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6989
6990   if (slp_node && !nunits.is_constant ())
6991     {
6992       /* The current SLP code creates the initial value element-by-element.  */
6993       if (dump_enabled_p ())
6994         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6995                          "SLP induction not supported for variable-length"
6996                          " vectors.\n");
6997       return false;
6998     }
6999
7000   if (!vec_stmt) /* transformation not required.  */
7001     {
7002       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7003       DUMP_VECT_SCOPE ("vectorizable_induction");
7004       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7005       return true;
7006     }
7007
7008   /* Transform.  */
7009
7010   /* Compute a vector variable, initialized with the first VF values of
7011      the induction variable.  E.g., for an iv with IV_PHI='X' and
7012      evolution S, for a vector of 4 units, we want to compute:
7013      [X, X + S, X + 2*S, X + 3*S].  */
7014
7015   if (dump_enabled_p ())
7016     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7017
7018   latch_e = loop_latch_edge (iv_loop);
7019   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7020
7021   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7022   gcc_assert (step_expr != NULL_TREE);
7023   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7024
7025   pe = loop_preheader_edge (iv_loop);
7026   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7027                                      loop_preheader_edge (iv_loop));
7028
7029   stmts = NULL;
7030   if (!nested_in_vect_loop)
7031     {
7032       /* Convert the initial value to the IV update type.  */
7033       tree new_type = TREE_TYPE (step_expr);
7034       init_expr = gimple_convert (&stmts, new_type, init_expr);
7035
7036       /* If we are using the loop mask to "peel" for alignment then we need
7037          to adjust the start value here.  */
7038       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7039       if (skip_niters != NULL_TREE)
7040         {
7041           if (FLOAT_TYPE_P (vectype))
7042             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7043                                         skip_niters);
7044           else
7045             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7046           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7047                                          skip_niters, step_expr);
7048           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7049                                     init_expr, skip_step);
7050         }
7051     }
7052
7053   if (stmts)
7054     {
7055       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7056       gcc_assert (!new_bb);
7057     }
7058
7059   /* Find the first insertion point in the BB.  */
7060   basic_block bb = gimple_bb (phi);
7061   si = gsi_after_labels (bb);
7062
7063   /* For SLP induction we have to generate several IVs as for example
7064      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7065      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7066      [VF*S, VF*S, VF*S, VF*S] for all.  */
7067   if (slp_node)
7068     {
7069       /* Enforced above.  */
7070       unsigned int const_nunits = nunits.to_constant ();
7071
7072       /* Generate [VF*S, VF*S, ... ].  */
7073       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7074         {
7075           expr = build_int_cst (integer_type_node, vf);
7076           expr = fold_convert (TREE_TYPE (step_expr), expr);
7077         }
7078       else
7079         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7080       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7081                               expr, step_expr);
7082       if (! CONSTANT_CLASS_P (new_name))
7083         new_name = vect_init_vector (stmt_info, new_name,
7084                                      TREE_TYPE (step_expr), NULL);
7085       new_vec = build_vector_from_val (step_vectype, new_name);
7086       vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7087
7088       /* Now generate the IVs.  */
7089       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7090       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7091       unsigned elts = const_nunits * nvects;
7092       unsigned nivs = least_common_multiple (group_size,
7093                                              const_nunits) / const_nunits;
7094       gcc_assert (elts % group_size == 0);
7095       tree elt = init_expr;
7096       unsigned ivn;
7097       for (ivn = 0; ivn < nivs; ++ivn)
7098         {
7099           tree_vector_builder elts (step_vectype, const_nunits, 1);
7100           stmts = NULL;
7101           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7102             {
7103               if (ivn*const_nunits + eltn >= group_size
7104                   && (ivn * const_nunits + eltn) % group_size == 0)
7105                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7106                                     elt, step_expr);
7107               elts.quick_push (elt);
7108             }
7109           vec_init = gimple_build_vector (&stmts, &elts);
7110           vec_init = gimple_convert (&stmts, vectype, vec_init);
7111           if (stmts)
7112             {
7113               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7114               gcc_assert (!new_bb);
7115             }
7116
7117           /* Create the induction-phi that defines the induction-operand.  */
7118           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7119           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7120           stmt_vec_info induction_phi_info
7121             = loop_vinfo->add_stmt (induction_phi);
7122           induc_def = PHI_RESULT (induction_phi);
7123
7124           /* Create the iv update inside the loop  */
7125           gimple_seq stmts = NULL;
7126           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7127           vec_def = gimple_build (&stmts,
7128                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
7129           vec_def = gimple_convert (&stmts, vectype, vec_def);
7130           loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7131           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7132
7133           /* Set the arguments of the phi node:  */
7134           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7135           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7136                        UNKNOWN_LOCATION);
7137
7138           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7139         }
7140
7141       /* Re-use IVs when we can.  */
7142       if (ivn < nvects)
7143         {
7144           unsigned vfp
7145             = least_common_multiple (group_size, const_nunits) / group_size;
7146           /* Generate [VF'*S, VF'*S, ... ].  */
7147           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7148             {
7149               expr = build_int_cst (integer_type_node, vfp);
7150               expr = fold_convert (TREE_TYPE (step_expr), expr);
7151             }
7152           else
7153             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7154           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7155                                   expr, step_expr);
7156           if (! CONSTANT_CLASS_P (new_name))
7157             new_name = vect_init_vector (stmt_info, new_name,
7158                                          TREE_TYPE (step_expr), NULL);
7159           new_vec = build_vector_from_val (step_vectype, new_name);
7160           vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7161           for (; ivn < nvects; ++ivn)
7162             {
7163               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7164               tree def;
7165               if (gimple_code (iv) == GIMPLE_PHI)
7166                 def = gimple_phi_result (iv);
7167               else
7168                 def = gimple_assign_lhs (iv);
7169               gimple_seq stmts = NULL;
7170               def = gimple_convert (&stmts, step_vectype, def);
7171               def = gimple_build (&stmts,
7172                                   PLUS_EXPR, step_vectype, def, vec_step);
7173               def = gimple_convert (&stmts, vectype, def);
7174               if (gimple_code (iv) == GIMPLE_PHI)
7175                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7176               else
7177                 {
7178                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7179                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7180                 }
7181               SLP_TREE_VEC_STMTS (slp_node).quick_push
7182                 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7183             }
7184         }
7185
7186       return true;
7187     }
7188
7189   /* Create the vector that holds the initial_value of the induction.  */
7190   if (nested_in_vect_loop)
7191     {
7192       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7193          been created during vectorization of previous stmts.  We obtain it
7194          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7195       vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7196       /* If the initial value is not of proper type, convert it.  */
7197       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7198         {
7199           new_stmt
7200             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7201                                                           vect_simple_var,
7202                                                           "vec_iv_"),
7203                                    VIEW_CONVERT_EXPR,
7204                                    build1 (VIEW_CONVERT_EXPR, vectype,
7205                                            vec_init));
7206           vec_init = gimple_assign_lhs (new_stmt);
7207           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7208                                                  new_stmt);
7209           gcc_assert (!new_bb);
7210           loop_vinfo->add_stmt (new_stmt);
7211         }
7212     }
7213   else
7214     {
7215       /* iv_loop is the loop to be vectorized. Create:
7216          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7217       stmts = NULL;
7218       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7219
7220       unsigned HOST_WIDE_INT const_nunits;
7221       if (nunits.is_constant (&const_nunits))
7222         {
7223           tree_vector_builder elts (step_vectype, const_nunits, 1);
7224           elts.quick_push (new_name);
7225           for (i = 1; i < const_nunits; i++)
7226             {
7227               /* Create: new_name_i = new_name + step_expr  */
7228               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7229                                        new_name, step_expr);
7230               elts.quick_push (new_name);
7231             }
7232           /* Create a vector from [new_name_0, new_name_1, ...,
7233              new_name_nunits-1]  */
7234           vec_init = gimple_build_vector (&stmts, &elts);
7235         }
7236       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7237         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7238         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7239                                  new_name, step_expr);
7240       else
7241         {
7242           /* Build:
7243                 [base, base, base, ...]
7244                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7245           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7246           gcc_assert (flag_associative_math);
7247           tree index = build_index_vector (step_vectype, 0, 1);
7248           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7249                                                         new_name);
7250           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7251                                                         step_expr);
7252           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7253           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7254                                    vec_init, step_vec);
7255           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7256                                    vec_init, base_vec);
7257         }
7258       vec_init = gimple_convert (&stmts, vectype, vec_init);
7259
7260       if (stmts)
7261         {
7262           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7263           gcc_assert (!new_bb);
7264         }
7265     }
7266
7267
7268   /* Create the vector that holds the step of the induction.  */
7269   if (nested_in_vect_loop)
7270     /* iv_loop is nested in the loop to be vectorized. Generate:
7271        vec_step = [S, S, S, S]  */
7272     new_name = step_expr;
7273   else
7274     {
7275       /* iv_loop is the loop to be vectorized. Generate:
7276           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7277       gimple_seq seq = NULL;
7278       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7279         {
7280           expr = build_int_cst (integer_type_node, vf);
7281           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7282         }
7283       else
7284         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7285       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7286                                expr, step_expr);
7287       if (seq)
7288         {
7289           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7290           gcc_assert (!new_bb);
7291         }
7292     }
7293
7294   t = unshare_expr (new_name);
7295   gcc_assert (CONSTANT_CLASS_P (new_name)
7296               || TREE_CODE (new_name) == SSA_NAME);
7297   new_vec = build_vector_from_val (step_vectype, t);
7298   vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7299
7300
7301   /* Create the following def-use cycle:
7302      loop prolog:
7303          vec_init = ...
7304          vec_step = ...
7305      loop:
7306          vec_iv = PHI <vec_init, vec_loop>
7307          ...
7308          STMT
7309          ...
7310          vec_loop = vec_iv + vec_step;  */
7311
7312   /* Create the induction-phi that defines the induction-operand.  */
7313   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7314   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7315   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7316   induc_def = PHI_RESULT (induction_phi);
7317
7318   /* Create the iv update inside the loop  */
7319   stmts = NULL;
7320   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7321   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7322   vec_def = gimple_convert (&stmts, vectype, vec_def);
7323   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7324   new_stmt = SSA_NAME_DEF_STMT (vec_def);
7325   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7326
7327   /* Set the arguments of the phi node:  */
7328   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7329   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7330                UNKNOWN_LOCATION);
7331
7332   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7333
7334   /* In case that vectorization factor (VF) is bigger than the number
7335      of elements that we can fit in a vectype (nunits), we have to generate
7336      more than one vector stmt - i.e - we need to "unroll" the
7337      vector stmt by a factor VF/nunits.  For more details see documentation
7338      in vectorizable_operation.  */
7339
7340   if (ncopies > 1)
7341     {
7342       gimple_seq seq = NULL;
7343       stmt_vec_info prev_stmt_vinfo;
7344       /* FORNOW. This restriction should be relaxed.  */
7345       gcc_assert (!nested_in_vect_loop);
7346
7347       /* Create the vector that holds the step of the induction.  */
7348       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7349         {
7350           expr = build_int_cst (integer_type_node, nunits);
7351           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7352         }
7353       else
7354         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7355       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7356                                expr, step_expr);
7357       if (seq)
7358         {
7359           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7360           gcc_assert (!new_bb);
7361         }
7362
7363       t = unshare_expr (new_name);
7364       gcc_assert (CONSTANT_CLASS_P (new_name)
7365                   || TREE_CODE (new_name) == SSA_NAME);
7366       new_vec = build_vector_from_val (step_vectype, t);
7367       vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7368
7369       vec_def = induc_def;
7370       prev_stmt_vinfo = induction_phi_info;
7371       for (i = 1; i < ncopies; i++)
7372         {
7373           /* vec_i = vec_prev + vec_step  */
7374           gimple_seq stmts = NULL;
7375           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7376           vec_def = gimple_build (&stmts,
7377                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
7378           vec_def = gimple_convert (&stmts, vectype, vec_def);
7379
7380           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7381           new_stmt = SSA_NAME_DEF_STMT (vec_def);
7382           new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7383           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7384           prev_stmt_vinfo = new_stmt_info;
7385         }
7386     }
7387
7388   if (nested_in_vect_loop)
7389     {
7390       /* Find the loop-closed exit-phi of the induction, and record
7391          the final vector of induction results:  */
7392       exit_phi = NULL;
7393       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7394         {
7395           gimple *use_stmt = USE_STMT (use_p);
7396           if (is_gimple_debug (use_stmt))
7397             continue;
7398
7399           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7400             {
7401               exit_phi = use_stmt;
7402               break;
7403             }
7404         }
7405       if (exit_phi)
7406         {
7407           stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7408           /* FORNOW. Currently not supporting the case that an inner-loop induction
7409              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7410           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7411                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7412
7413           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7414           if (dump_enabled_p ())
7415             dump_printf_loc (MSG_NOTE, vect_location,
7416                              "vector of inductions after inner-loop:%G",
7417                              new_stmt);
7418         }
7419     }
7420
7421
7422   if (dump_enabled_p ())
7423     dump_printf_loc (MSG_NOTE, vect_location,
7424                      "transform induction: created def-use cycle: %G%G",
7425                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
7426
7427   return true;
7428 }
7429
7430 /* Function vectorizable_live_operation.
7431
7432    STMT_INFO computes a value that is used outside the loop.  Check if
7433    it can be supported.  */
7434
7435 bool
7436 vectorizable_live_operation (stmt_vec_info stmt_info,
7437                              gimple_stmt_iterator *gsi,
7438                              slp_tree slp_node, slp_instance slp_node_instance,
7439                              int slp_index, bool vec_stmt_p,
7440                              stmt_vector_for_cost *)
7441 {
7442   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7443   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7444   imm_use_iterator imm_iter;
7445   tree lhs, lhs_type, bitsize, vec_bitsize;
7446   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7447   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7448   int ncopies;
7449   gimple *use_stmt;
7450   auto_vec<tree> vec_oprnds;
7451   int vec_entry = 0;
7452   poly_uint64 vec_index = 0;
7453
7454   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7455
7456   /* The last stmt of a reduction is live and vectorized via
7457      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
7458      validity so just trigger the transform here.  */
7459   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7460     {
7461       if (!vec_stmt_p)
7462         return true;
7463       if (slp_node)
7464         {
7465           /* For reduction chains the meta-info is attached to
7466              the group leader.  */
7467           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7468             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7469           /* For SLP reductions we vectorize the epilogue for
7470              all involved stmts together.  */
7471           else if (slp_index != 0)
7472             return true;
7473         }
7474       stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7475       gcc_assert (reduc_info->is_reduc_info);
7476       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7477           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7478         return true;
7479       vect_create_epilog_for_reduction (stmt_info, slp_node,
7480                                         slp_node_instance);
7481       return true;
7482     }
7483
7484   /* FORNOW.  CHECKME.  */
7485   if (nested_in_vect_loop_p (loop, stmt_info))
7486     return false;
7487
7488   /* If STMT is not relevant and it is a simple assignment and its inputs are
7489      invariant then it can remain in place, unvectorized.  The original last
7490      scalar value that it computes will be used.  */
7491   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7492     {
7493       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7494       if (dump_enabled_p ())
7495         dump_printf_loc (MSG_NOTE, vect_location,
7496                          "statement is simple and uses invariant.  Leaving in "
7497                          "place.\n");
7498       return true;
7499     }
7500
7501   if (slp_node)
7502     ncopies = 1;
7503   else
7504     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7505
7506   if (slp_node)
7507     {
7508       gcc_assert (slp_index >= 0);
7509
7510       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7511       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7512
7513       /* Get the last occurrence of the scalar index from the concatenation of
7514          all the slp vectors. Calculate which slp vector it is and the index
7515          within.  */
7516       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7517
7518       /* Calculate which vector contains the result, and which lane of
7519          that vector we need.  */
7520       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7521         {
7522           if (dump_enabled_p ())
7523             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7524                              "Cannot determine which vector holds the"
7525                              " final result.\n");
7526           return false;
7527         }
7528     }
7529
7530   if (!vec_stmt_p)
7531     {
7532       /* No transformation required.  */
7533       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7534         {
7535           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7536                                                OPTIMIZE_FOR_SPEED))
7537             {
7538               if (dump_enabled_p ())
7539                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7540                                  "can't use a fully-masked loop because "
7541                                  "the target doesn't support extract last "
7542                                  "reduction.\n");
7543               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7544             }
7545           else if (slp_node)
7546             {
7547               if (dump_enabled_p ())
7548                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7549                                  "can't use a fully-masked loop because an "
7550                                  "SLP statement is live after the loop.\n");
7551               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7552             }
7553           else if (ncopies > 1)
7554             {
7555               if (dump_enabled_p ())
7556                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7557                                  "can't use a fully-masked loop because"
7558                                  " ncopies is greater than 1.\n");
7559               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7560             }
7561           else
7562             {
7563               gcc_assert (ncopies == 1 && !slp_node);
7564               vect_record_loop_mask (loop_vinfo,
7565                                      &LOOP_VINFO_MASKS (loop_vinfo),
7566                                      1, vectype, NULL);
7567             }
7568         }
7569       return true;
7570     }
7571
7572   /* Use the lhs of the original scalar statement.  */
7573   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7574
7575   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7576         : gimple_get_lhs (stmt);
7577   lhs_type = TREE_TYPE (lhs);
7578
7579   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7580              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7581              : TYPE_SIZE (TREE_TYPE (vectype)));
7582   vec_bitsize = TYPE_SIZE (vectype);
7583
7584   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7585   tree vec_lhs, bitstart;
7586   if (slp_node)
7587     {
7588       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7589
7590       /* Get the correct slp vectorized stmt.  */
7591       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7592       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7593         vec_lhs = gimple_phi_result (phi);
7594       else
7595         vec_lhs = gimple_get_lhs (vec_stmt);
7596
7597       /* Get entry to use.  */
7598       bitstart = bitsize_int (vec_index);
7599       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7600     }
7601   else
7602     {
7603       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7604       vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7605       gcc_checking_assert (ncopies == 1
7606                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7607
7608       /* For multiple copies, get the last copy.  */
7609       for (int i = 1; i < ncopies; ++i)
7610         vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7611
7612       /* Get the last lane in the vector.  */
7613       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7614     }
7615
7616   gimple_seq stmts = NULL;
7617   tree new_tree;
7618   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7619     {
7620       /* Emit:
7621
7622            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7623
7624          where VEC_LHS is the vectorized live-out result and MASK is
7625          the loop mask for the final iteration.  */
7626       gcc_assert (ncopies == 1 && !slp_node);
7627       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7628       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7629                                       1, vectype, 0);
7630       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7631                                       scalar_type, mask, vec_lhs);
7632
7633       /* Convert the extracted vector element to the required scalar type.  */
7634       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7635     }
7636   else
7637     {
7638       tree bftype = TREE_TYPE (vectype);
7639       if (VECTOR_BOOLEAN_TYPE_P (vectype))
7640         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7641       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7642       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7643                                        &stmts, true, NULL_TREE);
7644     }
7645
7646   if (stmts)
7647     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7648
7649   /* Replace use of lhs with newly computed result.  If the use stmt is a
7650      single arg PHI, just replace all uses of PHI result.  It's necessary
7651      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7652   use_operand_p use_p;
7653   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7654     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7655         && !is_gimple_debug (use_stmt))
7656     {
7657       if (gimple_code (use_stmt) == GIMPLE_PHI
7658           && gimple_phi_num_args (use_stmt) == 1)
7659         {
7660           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7661         }
7662       else
7663         {
7664           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7665             SET_USE (use_p, new_tree);
7666         }
7667       update_stmt (use_stmt);
7668     }
7669
7670   return true;
7671 }
7672
7673 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
7674
7675 static void
7676 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
7677 {
7678   ssa_op_iter op_iter;
7679   imm_use_iterator imm_iter;
7680   def_operand_p def_p;
7681   gimple *ustmt;
7682
7683   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7684     {
7685       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7686         {
7687           basic_block bb;
7688
7689           if (!is_gimple_debug (ustmt))
7690             continue;
7691
7692           bb = gimple_bb (ustmt);
7693
7694           if (!flow_bb_inside_loop_p (loop, bb))
7695             {
7696               if (gimple_debug_bind_p (ustmt))
7697                 {
7698                   if (dump_enabled_p ())
7699                     dump_printf_loc (MSG_NOTE, vect_location,
7700                                      "killing debug use\n");
7701
7702                   gimple_debug_bind_reset_value (ustmt);
7703                   update_stmt (ustmt);
7704                 }
7705               else
7706                 gcc_unreachable ();
7707             }
7708         }
7709     }
7710 }
7711
7712 /* Given loop represented by LOOP_VINFO, return true if computation of
7713    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7714    otherwise.  */
7715
7716 static bool
7717 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7718 {
7719   /* Constant case.  */
7720   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7721     {
7722       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7723       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7724
7725       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7726       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7727       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7728         return true;
7729     }
7730
7731   widest_int max;
7732   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7733   /* Check the upper bound of loop niters.  */
7734   if (get_max_loop_iterations (loop, &max))
7735     {
7736       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7737       signop sgn = TYPE_SIGN (type);
7738       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7739       if (max < type_max)
7740         return true;
7741     }
7742   return false;
7743 }
7744
7745 /* Return a mask type with half the number of elements as TYPE.  */
7746
7747 tree
7748 vect_halve_mask_nunits (vec_info *vinfo, tree type)
7749 {
7750   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7751   return build_truth_vector_type (nunits, vinfo->vector_size);
7752 }
7753
7754 /* Return a mask type with twice as many elements as TYPE.  */
7755
7756 tree
7757 vect_double_mask_nunits (vec_info *vinfo, tree type)
7758 {
7759   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7760   return build_truth_vector_type (nunits, vinfo->vector_size);
7761 }
7762
7763 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7764    contain a sequence of NVECTORS masks that each control a vector of type
7765    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
7766    these vector masks with the vector version of SCALAR_MASK.  */
7767
7768 void
7769 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7770                        unsigned int nvectors, tree vectype, tree scalar_mask)
7771 {
7772   gcc_assert (nvectors != 0);
7773   if (masks->length () < nvectors)
7774     masks->safe_grow_cleared (nvectors);
7775   rgroup_masks *rgm = &(*masks)[nvectors - 1];
7776   /* The number of scalars per iteration and the number of vectors are
7777      both compile-time constants.  */
7778   unsigned int nscalars_per_iter
7779     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
7780                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
7781
7782   if (scalar_mask)
7783     {
7784       scalar_cond_masked_key cond (scalar_mask, nvectors);
7785       loop_vinfo->scalar_cond_masked_set.add (cond);
7786     }
7787
7788   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
7789     {
7790       rgm->max_nscalars_per_iter = nscalars_per_iter;
7791       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
7792     }
7793 }
7794
7795 /* Given a complete set of masks MASKS, extract mask number INDEX
7796    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
7797    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
7798
7799    See the comment above vec_loop_masks for more details about the mask
7800    arrangement.  */
7801
7802 tree
7803 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
7804                     unsigned int nvectors, tree vectype, unsigned int index)
7805 {
7806   rgroup_masks *rgm = &(*masks)[nvectors - 1];
7807   tree mask_type = rgm->mask_type;
7808
7809   /* Populate the rgroup's mask array, if this is the first time we've
7810      used it.  */
7811   if (rgm->masks.is_empty ())
7812     {
7813       rgm->masks.safe_grow_cleared (nvectors);
7814       for (unsigned int i = 0; i < nvectors; ++i)
7815         {
7816           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
7817           /* Provide a dummy definition until the real one is available.  */
7818           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
7819           rgm->masks[i] = mask;
7820         }
7821     }
7822
7823   tree mask = rgm->masks[index];
7824   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
7825                 TYPE_VECTOR_SUBPARTS (vectype)))
7826     {
7827       /* A loop mask for data type X can be reused for data type Y
7828          if X has N times more elements than Y and if Y's elements
7829          are N times bigger than X's.  In this case each sequence
7830          of N elements in the loop mask will be all-zero or all-one.
7831          We can then view-convert the mask so that each sequence of
7832          N elements is replaced by a single element.  */
7833       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
7834                               TYPE_VECTOR_SUBPARTS (vectype)));
7835       gimple_seq seq = NULL;
7836       mask_type = build_same_sized_truth_vector_type (vectype);
7837       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
7838       if (seq)
7839         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
7840     }
7841   return mask;
7842 }
7843
7844 /* Scale profiling counters by estimation for LOOP which is vectorized
7845    by factor VF.  */
7846
7847 static void
7848 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
7849 {
7850   edge preheader = loop_preheader_edge (loop);
7851   /* Reduce loop iterations by the vectorization factor.  */
7852   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7853   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7854
7855   if (freq_h.nonzero_p ())
7856     {
7857       profile_probability p;
7858
7859       /* Avoid dropping loop body profile counter to 0 because of zero count
7860          in loop's preheader.  */
7861       if (!(freq_e == profile_count::zero ()))
7862         freq_e = freq_e.force_nonzero ();
7863       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7864       scale_loop_frequencies (loop, p);
7865     }
7866
7867   edge exit_e = single_exit (loop);
7868   exit_e->probability = profile_probability::always ()
7869                                  .apply_scale (1, new_est_niter + 1);
7870
7871   edge exit_l = single_pred_edge (loop->latch);
7872   profile_probability prob = exit_l->probability;
7873   exit_l->probability = exit_e->probability.invert ();
7874   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7875     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7876 }
7877
7878 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
7879    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
7880    stmt_vec_info.  */
7881
7882 static void
7883 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7884                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
7885 {
7886   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7887   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7888
7889   if (dump_enabled_p ())
7890     dump_printf_loc (MSG_NOTE, vect_location,
7891                      "------>vectorizing statement: %G", stmt_info->stmt);
7892
7893   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7894     vect_loop_kill_debug_uses (loop, stmt_info);
7895
7896   if (!STMT_VINFO_RELEVANT_P (stmt_info)
7897       && !STMT_VINFO_LIVE_P (stmt_info))
7898     return;
7899
7900   if (STMT_VINFO_VECTYPE (stmt_info))
7901     {
7902       poly_uint64 nunits
7903         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7904       if (!STMT_SLP_TYPE (stmt_info)
7905           && maybe_ne (nunits, vf)
7906           && dump_enabled_p ())
7907         /* For SLP VF is set according to unrolling factor, and not
7908            to vector size, hence for SLP this print is not valid.  */
7909         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7910     }
7911
7912   /* Pure SLP statements have already been vectorized.  We still need
7913      to apply loop vectorization to hybrid SLP statements.  */
7914   if (PURE_SLP_STMT (stmt_info))
7915     return;
7916
7917   if (dump_enabled_p ())
7918     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7919
7920   if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
7921     *seen_store = stmt_info;
7922 }
7923
7924 /* Function vect_transform_loop.
7925
7926    The analysis phase has determined that the loop is vectorizable.
7927    Vectorize the loop - created vectorized stmts to replace the scalar
7928    stmts in the loop, and update the loop exit condition.
7929    Returns scalar epilogue loop if any.  */
7930
7931 class loop *
7932 vect_transform_loop (loop_vec_info loop_vinfo)
7933 {
7934   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7935   class loop *epilogue = NULL;
7936   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7937   int nbbs = loop->num_nodes;
7938   int i;
7939   tree niters_vector = NULL_TREE;
7940   tree step_vector = NULL_TREE;
7941   tree niters_vector_mult_vf = NULL_TREE;
7942   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7943   unsigned int lowest_vf = constant_lower_bound (vf);
7944   gimple *stmt;
7945   bool check_profitability = false;
7946   unsigned int th;
7947
7948   DUMP_VECT_SCOPE ("vec_transform_loop");
7949
7950   loop_vinfo->shared->check_datarefs ();
7951
7952   /* Use the more conservative vectorization threshold.  If the number
7953      of iterations is constant assume the cost check has been performed
7954      by our caller.  If the threshold makes all loops profitable that
7955      run at least the (estimated) vectorization factor number of times
7956      checking is pointless, too.  */
7957   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7958   if (th >= vect_vf_for_cost (loop_vinfo)
7959       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7960     {
7961       if (dump_enabled_p ())
7962         dump_printf_loc (MSG_NOTE, vect_location,
7963                          "Profitability threshold is %d loop iterations.\n",
7964                          th);
7965       check_profitability = true;
7966     }
7967
7968   /* Make sure there exists a single-predecessor exit bb.  Do this before
7969      versioning.   */
7970   edge e = single_exit (loop);
7971   if (! single_pred_p (e->dest))
7972     {
7973       split_loop_exit_edge (e, true);
7974       if (dump_enabled_p ())
7975         dump_printf (MSG_NOTE, "split exit edge\n");
7976     }
7977
7978   /* Version the loop first, if required, so the profitability check
7979      comes first.  */
7980
7981   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7982     {
7983       class loop *sloop
7984         = vect_loop_versioning (loop_vinfo);
7985       sloop->force_vectorize = false;
7986       check_profitability = false;
7987     }
7988
7989   /* Make sure there exists a single-predecessor exit bb also on the
7990      scalar loop copy.  Do this after versioning but before peeling
7991      so CFG structure is fine for both scalar and if-converted loop
7992      to make slpeel_duplicate_current_defs_from_edges face matched
7993      loop closed PHI nodes on the exit.  */
7994   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7995     {
7996       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7997       if (! single_pred_p (e->dest))
7998         {
7999           split_loop_exit_edge (e, true);
8000           if (dump_enabled_p ())
8001             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8002         }
8003     }
8004
8005   tree niters = vect_build_loop_niters (loop_vinfo);
8006   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8007   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8008   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8009   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8010                               &step_vector, &niters_vector_mult_vf, th,
8011                               check_profitability, niters_no_overflow);
8012   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8013       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8014     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8015                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8016
8017   if (niters_vector == NULL_TREE)
8018     {
8019       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8020           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8021           && known_eq (lowest_vf, vf))
8022         {
8023           niters_vector
8024             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8025                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8026           step_vector = build_one_cst (TREE_TYPE (niters));
8027         }
8028       else
8029         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8030                                      &step_vector, niters_no_overflow);
8031     }
8032
8033   /* 1) Make sure the loop header has exactly two entries
8034      2) Make sure we have a preheader basic block.  */
8035
8036   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8037
8038   split_edge (loop_preheader_edge (loop));
8039
8040   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8041       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8042     /* This will deal with any possible peeling.  */
8043     vect_prepare_for_masked_peels (loop_vinfo);
8044
8045   /* Schedule the SLP instances first, then handle loop vectorization
8046      below.  */
8047   if (!loop_vinfo->slp_instances.is_empty ())
8048     {
8049       DUMP_VECT_SCOPE ("scheduling SLP instances");
8050       vect_schedule_slp (loop_vinfo);
8051     }
8052
8053   /* FORNOW: the vectorizer supports only loops which body consist
8054      of one basic block (header + empty latch). When the vectorizer will
8055      support more involved loop forms, the order by which the BBs are
8056      traversed need to be reconsidered.  */
8057
8058   for (i = 0; i < nbbs; i++)
8059     {
8060       basic_block bb = bbs[i];
8061       stmt_vec_info stmt_info;
8062
8063       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8064            gsi_next (&si))
8065         {
8066           gphi *phi = si.phi ();
8067           if (dump_enabled_p ())
8068             dump_printf_loc (MSG_NOTE, vect_location,
8069                              "------>vectorizing phi: %G", phi);
8070           stmt_info = loop_vinfo->lookup_stmt (phi);
8071           if (!stmt_info)
8072             continue;
8073
8074           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8075             vect_loop_kill_debug_uses (loop, stmt_info);
8076
8077           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8078               && !STMT_VINFO_LIVE_P (stmt_info))
8079             continue;
8080
8081           if (STMT_VINFO_VECTYPE (stmt_info)
8082               && (maybe_ne
8083                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8084               && dump_enabled_p ())
8085             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8086
8087           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8088                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8089                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8090                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8091                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8092               && ! PURE_SLP_STMT (stmt_info))
8093             {
8094               if (dump_enabled_p ())
8095                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8096               vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8097             }
8098         }
8099
8100       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8101            !gsi_end_p (si);)
8102         {
8103           stmt = gsi_stmt (si);
8104           /* During vectorization remove existing clobber stmts.  */
8105           if (gimple_clobber_p (stmt))
8106             {
8107               unlink_stmt_vdef (stmt);
8108               gsi_remove (&si, true);
8109               release_defs (stmt);
8110             }
8111           else
8112             {
8113               stmt_info = loop_vinfo->lookup_stmt (stmt);
8114
8115               /* vector stmts created in the outer-loop during vectorization of
8116                  stmts in an inner-loop may not have a stmt_info, and do not
8117                  need to be vectorized.  */
8118               stmt_vec_info seen_store = NULL;
8119               if (stmt_info)
8120                 {
8121                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8122                     {
8123                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8124                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8125                            !gsi_end_p (subsi); gsi_next (&subsi))
8126                         {
8127                           stmt_vec_info pat_stmt_info
8128                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8129                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8130                                                     &si, &seen_store);
8131                         }
8132                       stmt_vec_info pat_stmt_info
8133                         = STMT_VINFO_RELATED_STMT (stmt_info);
8134                       vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8135                                                 &seen_store);
8136                     }
8137                   vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8138                                             &seen_store);
8139                 }
8140               gsi_next (&si);
8141               if (seen_store)
8142                 {
8143                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8144                     /* Interleaving.  If IS_STORE is TRUE, the
8145                        vectorization of the interleaving chain was
8146                        completed - free all the stores in the chain.  */
8147                     vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8148                   else
8149                     /* Free the attached stmt_vec_info and remove the stmt.  */
8150                     loop_vinfo->remove_stmt (stmt_info);
8151                 }
8152             }
8153         }
8154
8155       /* Stub out scalar statements that must not survive vectorization.
8156          Doing this here helps with grouped statements, or statements that
8157          are involved in patterns.  */
8158       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8159            !gsi_end_p (gsi); gsi_next (&gsi))
8160         {
8161           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8162           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8163             {
8164               tree lhs = gimple_get_lhs (call);
8165               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8166                 {
8167                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8168                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8169                   gsi_replace (&gsi, new_stmt, true);
8170                 }
8171             }
8172         }
8173     }                           /* BBs in loop */
8174
8175   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8176      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8177   if (integer_onep (step_vector))
8178     niters_no_overflow = true;
8179   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8180                            niters_vector_mult_vf, !niters_no_overflow);
8181
8182   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8183   scale_profile_for_vect_loop (loop, assumed_vf);
8184
8185   /* True if the final iteration might not handle a full vector's
8186      worth of scalar iterations.  */
8187   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8188   /* The minimum number of iterations performed by the epilogue.  This
8189      is 1 when peeling for gaps because we always need a final scalar
8190      iteration.  */
8191   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8192   /* +1 to convert latch counts to loop iteration counts,
8193      -min_epilogue_iters to remove iterations that cannot be performed
8194        by the vector code.  */
8195   int bias_for_lowest = 1 - min_epilogue_iters;
8196   int bias_for_assumed = bias_for_lowest;
8197   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8198   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8199     {
8200       /* When the amount of peeling is known at compile time, the first
8201          iteration will have exactly alignment_npeels active elements.
8202          In the worst case it will have at least one.  */
8203       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8204       bias_for_lowest += lowest_vf - min_first_active;
8205       bias_for_assumed += assumed_vf - min_first_active;
8206     }
8207   /* In these calculations the "- 1" converts loop iteration counts
8208      back to latch counts.  */
8209   if (loop->any_upper_bound)
8210     loop->nb_iterations_upper_bound
8211       = (final_iter_may_be_partial
8212          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8213                           lowest_vf) - 1
8214          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8215                            lowest_vf) - 1);
8216   if (loop->any_likely_upper_bound)
8217     loop->nb_iterations_likely_upper_bound
8218       = (final_iter_may_be_partial
8219          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8220                           + bias_for_lowest, lowest_vf) - 1
8221          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8222                            + bias_for_lowest, lowest_vf) - 1);
8223   if (loop->any_estimate)
8224     loop->nb_iterations_estimate
8225       = (final_iter_may_be_partial
8226          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8227                           assumed_vf) - 1
8228          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8229                            assumed_vf) - 1);
8230
8231   if (dump_enabled_p ())
8232     {
8233       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8234         {
8235           dump_printf_loc (MSG_NOTE, vect_location,
8236                            "LOOP VECTORIZED\n");
8237           if (loop->inner)
8238             dump_printf_loc (MSG_NOTE, vect_location,
8239                              "OUTER LOOP VECTORIZED\n");
8240           dump_printf (MSG_NOTE, "\n");
8241         }
8242       else
8243         {
8244           dump_printf_loc (MSG_NOTE, vect_location,
8245                            "LOOP EPILOGUE VECTORIZED (VS=");
8246           dump_dec (MSG_NOTE, loop_vinfo->vector_size);
8247           dump_printf (MSG_NOTE, ")\n");
8248         }
8249     }
8250
8251   /* Loops vectorized with a variable factor won't benefit from
8252      unrolling/peeling.  */
8253   if (!vf.is_constant ())
8254     {
8255       loop->unroll = 1;
8256       if (dump_enabled_p ())
8257         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8258                          " variable-length vectorization factor\n");
8259     }
8260   /* Free SLP instances here because otherwise stmt reference counting
8261      won't work.  */
8262   slp_instance instance;
8263   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8264     vect_free_slp_instance (instance, true);
8265   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8266   /* Clear-up safelen field since its value is invalid after vectorization
8267      since vectorized loop can have loop-carried dependencies.  */
8268   loop->safelen = 0;
8269
8270   /* Don't vectorize epilogue for epilogue.  */
8271   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8272     epilogue = NULL;
8273
8274   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8275     epilogue = NULL;
8276
8277   if (epilogue)
8278     {
8279       auto_vector_sizes vector_sizes;
8280       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8281       unsigned int next_size = 0;
8282
8283       /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8284          on niters already ajusted for the iterations of the prologue.  */
8285       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8286           && known_eq (vf, lowest_vf))
8287         {
8288           unsigned HOST_WIDE_INT eiters
8289             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8290                - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8291           eiters
8292             = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8293           epilogue->nb_iterations_upper_bound = eiters - 1;
8294           epilogue->any_upper_bound = true;
8295
8296           unsigned int ratio;
8297           while (next_size < vector_sizes.length ()
8298                  && !(constant_multiple_p (loop_vinfo->vector_size,
8299                                            vector_sizes[next_size], &ratio)
8300                       && eiters >= lowest_vf / ratio))
8301             next_size += 1;
8302         }
8303       else
8304         while (next_size < vector_sizes.length ()
8305                && maybe_lt (loop_vinfo->vector_size, vector_sizes[next_size]))
8306           next_size += 1;
8307
8308       if (next_size == vector_sizes.length ())
8309         epilogue = NULL;
8310     }
8311
8312   if (epilogue)
8313     {
8314       epilogue->force_vectorize = loop->force_vectorize;
8315       epilogue->safelen = loop->safelen;
8316       epilogue->dont_vectorize = false;
8317
8318       /* We may need to if-convert epilogue to vectorize it.  */
8319       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8320         tree_if_conversion (epilogue);
8321     }
8322
8323   return epilogue;
8324 }
8325
8326 /* The code below is trying to perform simple optimization - revert
8327    if-conversion for masked stores, i.e. if the mask of a store is zero
8328    do not perform it and all stored value producers also if possible.
8329    For example,
8330      for (i=0; i<n; i++)
8331        if (c[i])
8332         {
8333           p1[i] += 1;
8334           p2[i] = p3[i] +2;
8335         }
8336    this transformation will produce the following semi-hammock:
8337
8338    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8339      {
8340        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8341        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8342        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8343        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8344        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8345        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8346      }
8347 */
8348
8349 void
8350 optimize_mask_stores (class loop *loop)
8351 {
8352   basic_block *bbs = get_loop_body (loop);
8353   unsigned nbbs = loop->num_nodes;
8354   unsigned i;
8355   basic_block bb;
8356   class loop *bb_loop;
8357   gimple_stmt_iterator gsi;
8358   gimple *stmt;
8359   auto_vec<gimple *> worklist;
8360   auto_purge_vect_location sentinel;
8361
8362   vect_location = find_loop_location (loop);
8363   /* Pick up all masked stores in loop if any.  */
8364   for (i = 0; i < nbbs; i++)
8365     {
8366       bb = bbs[i];
8367       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8368            gsi_next (&gsi))
8369         {
8370           stmt = gsi_stmt (gsi);
8371           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8372             worklist.safe_push (stmt);
8373         }
8374     }
8375
8376   free (bbs);
8377   if (worklist.is_empty ())
8378     return;
8379
8380   /* Loop has masked stores.  */
8381   while (!worklist.is_empty ())
8382     {
8383       gimple *last, *last_store;
8384       edge e, efalse;
8385       tree mask;
8386       basic_block store_bb, join_bb;
8387       gimple_stmt_iterator gsi_to;
8388       tree vdef, new_vdef;
8389       gphi *phi;
8390       tree vectype;
8391       tree zero;
8392
8393       last = worklist.pop ();
8394       mask = gimple_call_arg (last, 2);
8395       bb = gimple_bb (last);
8396       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8397          the same loop as if_bb.  It could be different to LOOP when two
8398          level loop-nest is vectorized and mask_store belongs to the inner
8399          one.  */
8400       e = split_block (bb, last);
8401       bb_loop = bb->loop_father;
8402       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8403       join_bb = e->dest;
8404       store_bb = create_empty_bb (bb);
8405       add_bb_to_loop (store_bb, bb_loop);
8406       e->flags = EDGE_TRUE_VALUE;
8407       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8408       /* Put STORE_BB to likely part.  */
8409       efalse->probability = profile_probability::unlikely ();
8410       store_bb->count = efalse->count ();
8411       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8412       if (dom_info_available_p (CDI_DOMINATORS))
8413         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8414       if (dump_enabled_p ())
8415         dump_printf_loc (MSG_NOTE, vect_location,
8416                          "Create new block %d to sink mask stores.",
8417                          store_bb->index);
8418       /* Create vector comparison with boolean result.  */
8419       vectype = TREE_TYPE (mask);
8420       zero = build_zero_cst (vectype);
8421       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8422       gsi = gsi_last_bb (bb);
8423       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8424       /* Create new PHI node for vdef of the last masked store:
8425          .MEM_2 = VDEF <.MEM_1>
8426          will be converted to
8427          .MEM.3 = VDEF <.MEM_1>
8428          and new PHI node will be created in join bb
8429          .MEM_2 = PHI <.MEM_1, .MEM_3>
8430       */
8431       vdef = gimple_vdef (last);
8432       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8433       gimple_set_vdef (last, new_vdef);
8434       phi = create_phi_node (vdef, join_bb);
8435       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8436
8437       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8438       while (true)
8439         {
8440           gimple_stmt_iterator gsi_from;
8441           gimple *stmt1 = NULL;
8442
8443           /* Move masked store to STORE_BB.  */
8444           last_store = last;
8445           gsi = gsi_for_stmt (last);
8446           gsi_from = gsi;
8447           /* Shift GSI to the previous stmt for further traversal.  */
8448           gsi_prev (&gsi);
8449           gsi_to = gsi_start_bb (store_bb);
8450           gsi_move_before (&gsi_from, &gsi_to);
8451           /* Setup GSI_TO to the non-empty block start.  */
8452           gsi_to = gsi_start_bb (store_bb);
8453           if (dump_enabled_p ())
8454             dump_printf_loc (MSG_NOTE, vect_location,
8455                              "Move stmt to created bb\n%G", last);
8456           /* Move all stored value producers if possible.  */
8457           while (!gsi_end_p (gsi))
8458             {
8459               tree lhs;
8460               imm_use_iterator imm_iter;
8461               use_operand_p use_p;
8462               bool res;
8463
8464               /* Skip debug statements.  */
8465               if (is_gimple_debug (gsi_stmt (gsi)))
8466                 {
8467                   gsi_prev (&gsi);
8468                   continue;
8469                 }
8470               stmt1 = gsi_stmt (gsi);
8471               /* Do not consider statements writing to memory or having
8472                  volatile operand.  */
8473               if (gimple_vdef (stmt1)
8474                   || gimple_has_volatile_ops (stmt1))
8475                 break;
8476               gsi_from = gsi;
8477               gsi_prev (&gsi);
8478               lhs = gimple_get_lhs (stmt1);
8479               if (!lhs)
8480                 break;
8481
8482               /* LHS of vectorized stmt must be SSA_NAME.  */
8483               if (TREE_CODE (lhs) != SSA_NAME)
8484                 break;
8485
8486               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8487                 {
8488                   /* Remove dead scalar statement.  */
8489                   if (has_zero_uses (lhs))
8490                     {
8491                       gsi_remove (&gsi_from, true);
8492                       continue;
8493                     }
8494                 }
8495
8496               /* Check that LHS does not have uses outside of STORE_BB.  */
8497               res = true;
8498               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8499                 {
8500                   gimple *use_stmt;
8501                   use_stmt = USE_STMT (use_p);
8502                   if (is_gimple_debug (use_stmt))
8503                     continue;
8504                   if (gimple_bb (use_stmt) != store_bb)
8505                     {
8506                       res = false;
8507                       break;
8508                     }
8509                 }
8510               if (!res)
8511                 break;
8512
8513               if (gimple_vuse (stmt1)
8514                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8515                 break;
8516
8517               /* Can move STMT1 to STORE_BB.  */
8518               if (dump_enabled_p ())
8519                 dump_printf_loc (MSG_NOTE, vect_location,
8520                                  "Move stmt to created bb\n%G", stmt1);
8521               gsi_move_before (&gsi_from, &gsi_to);
8522               /* Shift GSI_TO for further insertion.  */
8523               gsi_prev (&gsi_to);
8524             }
8525           /* Put other masked stores with the same mask to STORE_BB.  */
8526           if (worklist.is_empty ()
8527               || gimple_call_arg (worklist.last (), 2) != mask
8528               || worklist.last () != stmt1)
8529             break;
8530           last = worklist.pop ();
8531         }
8532       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8533     }
8534 }
8535
8536 /* Decide whether it is possible to use a zero-based induction variable
8537    when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
8538    return the value that the induction variable must be able to hold
8539    in order to ensure that the loop ends with an all-false mask.
8540    Return -1 otherwise.  */
8541 widest_int
8542 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
8543 {
8544   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8545   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8546   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
8547
8548   /* Calculate the value that the induction variable must be able
8549      to hit in order to ensure that we end the loop with an all-false mask.
8550      This involves adding the maximum number of inactive trailing scalar
8551      iterations.  */
8552   widest_int iv_limit = -1;
8553   if (max_loop_iterations (loop, &iv_limit))
8554     {
8555       if (niters_skip)
8556         {
8557           /* Add the maximum number of skipped iterations to the
8558              maximum iteration count.  */
8559           if (TREE_CODE (niters_skip) == INTEGER_CST)
8560             iv_limit += wi::to_widest (niters_skip);
8561           else
8562             iv_limit += max_vf - 1;
8563         }
8564       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
8565         /* Make a conservatively-correct assumption.  */
8566         iv_limit += max_vf - 1;
8567
8568       /* IV_LIMIT is the maximum number of latch iterations, which is also
8569          the maximum in-range IV value.  Round this value down to the previous
8570          vector alignment boundary and then add an extra full iteration.  */
8571       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8572       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
8573     }
8574   return iv_limit;
8575 }
8576