gcc/tree-vect-loop.c (official-gcc.git)
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
}
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
}
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
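For illustration (a sketch, not the precise condition the analysis
checks), the first loop below has the simple consecutive accesses we
handle, while the second does not, because the access is strided:

for (i=0; i<N; i++)
a[i] = b[i]; (consecutive: supported)

for (i=0; i<N; i++)
a[2*i] = b[i]; (strided: not a simple consecutive access)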
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
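For illustration (reusing the v8hi example above; the optab and mode
are arbitrary), such a check has roughly this shape:

if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
the target cannot add v8hi vectors, so the stmt is not vectorized.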
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool);
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
164 static opt_result
165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf,
168 vec<stmt_vec_info > *mask_producers)
170 gimple *stmt = stmt_info->stmt;
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
187 if (stmt_vectype)
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else if (stmt_vectype == boolean_type_node)
197 mask_producers->safe_push (stmt_info);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
205 return opt_result::success ();
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. If some of the statements
211 produce a mask result whose vector type can only be calculated later,
212 add them to MASK_PRODUCERS. Return true on success or false if
213 something prevented vectorization. */
215 static opt_result
216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
217 vec<stmt_vec_info > *mask_producers)
219 vec_info *vinfo = stmt_info->vinfo;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res
224 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
225 if (!res)
226 return res;
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229 && STMT_VINFO_RELATED_STMT (stmt_info))
231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 !gsi_end_p (si); gsi_next (&si))
238 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE, vect_location,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
246 vf, mask_producers);
247 if (!res)
248 return res;
251 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location,
253 "==> examining pattern statement: %G",
254 stmt_info->stmt);
255 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
256 if (!res)
257 return res;
260 return opt_result::success ();
263 /* Function vect_determine_vectorization_factor
265 Determine the vectorization factor (VF). VF is the number of data elements
266 that are operated upon in parallel in a single iteration of the vectorized
267 loop. For example, when vectorizing a loop that operates on 4-byte elements,
268 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
269 elements can fit in a single vector register.
271 We currently support vectorization of loops in which all types operated upon
272 are of the same size. Therefore this function currently sets VF according to
273 the size of the types operated upon, and fails if there are multiple sizes
274 in the loop.
276 VF is also the factor by which the loop iterations are strip-mined, e.g.:
277 original loop:
278 for (i=0; i<N; i++){
279 a[i] = b[i] + c[i];
}
282 vectorized loop:
283 for (i=0; i<N; i+=VF){
284 a[i:VF] = b[i:VF] + c[i:VF];
}
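For instance (numbers taken from the example at the top of this file),
with 2-byte short elements and a 16-byte vector size, VF = 16/2 = 8 and
each iteration of the strip-mined loop handles elements i .. i+7.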
288 static opt_result
289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
291 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
292 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
293 unsigned nbbs = loop->num_nodes;
294 poly_uint64 vectorization_factor = 1;
295 tree scalar_type = NULL_TREE;
296 gphi *phi;
297 tree vectype;
298 stmt_vec_info stmt_info;
299 unsigned i;
300 auto_vec<stmt_vec_info> mask_producers;
302 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
304 for (i = 0; i < nbbs; i++)
306 basic_block bb = bbs[i];
308 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
309 gsi_next (&si))
311 phi = si.phi ();
312 stmt_info = loop_vinfo->lookup_stmt (phi);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
315 phi);
317 gcc_assert (stmt_info);
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
325 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: %T\n",
328 scalar_type);
330 vectype = get_vectype_for_scalar_type (scalar_type);
331 if (!vectype)
332 return opt_result::failure_at (phi,
333 "not vectorized: unsupported "
334 "data-type %T\n",
335 scalar_type);
336 STMT_VINFO_VECTYPE (stmt_info) = vectype;
338 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
340 vectype);
342 if (dump_enabled_p ())
344 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
345 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
346 dump_printf (MSG_NOTE, "\n");
349 vect_update_max_nunits (&vectorization_factor, vectype);
353 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
354 gsi_next (&si))
356 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
357 opt_result res
358 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
359 &mask_producers);
360 if (!res)
361 return res;
365 /* TODO: Analyze cost. Decide if worth while to vectorize. */
366 if (dump_enabled_p ())
368 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
369 dump_dec (MSG_NOTE, vectorization_factor);
370 dump_printf (MSG_NOTE, "\n");
373 if (known_le (vectorization_factor, 1U))
374 return opt_result::failure_at (vect_location,
375 "not vectorized: unsupported data-type\n");
376 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
378 for (i = 0; i < mask_producers.length (); i++)
380 stmt_info = mask_producers[i];
381 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
382 if (!mask_type)
383 return opt_result::propagate_failure (mask_type);
384 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
387 return opt_result::success ();
391 /* Function vect_is_simple_iv_evolution.
393 FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution. */
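/* For illustration (names invented): for an IV that enters the loop as
i_0 and is incremented by 4 each iteration, scev represents the access
function as the chrec {i_0, +, 4}_loop; its evolution part in this loop
is the INTEGER_CST 4 and its initial condition is i_0, which is the kind
of evolution treated as "simple" here. */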
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 basic_block bb;
405 /* When there is no evolution in this loop, the evolution function
406 is not "simple". */
407 if (evolution_part == NULL_TREE)
408 return false;
410 /* When the evolution is a polynomial of degree >= 2
411 the evolution function is not "simple". */
412 if (tree_is_chrec (evolution_part))
413 return false;
415 step_expr = evolution_part;
416 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
420 step_expr, init_expr);
422 *init = init_expr;
423 *step = step_expr;
425 if (TREE_CODE (step_expr) != INTEGER_CST
426 && (TREE_CODE (step_expr) != SSA_NAME
427 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
428 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
429 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
430 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
431 || !flag_associative_math)))
432 && (TREE_CODE (step_expr) != REAL_CST
433 || !flag_associative_math))
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "step unknown.\n");
438 return false;
441 return true;
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445 what we are assuming is a double reduction. For example, given
446 a structure like this:
448 outer1:
449 x_1 = PHI <x_4(outer2), ...>;
452 inner:
453 x_2 = PHI <x_1(outer1), ...>;
455 x_3 = ...;
458 outer2:
459 x_4 = PHI <x_3(inner)>;
462 outer loop analysis would treat x_1 as a double reduction phi and
463 this function would then return true for x_2. */
465 static bool
466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
469 use_operand_p use_p;
470 ssa_op_iter op_iter;
471 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
472 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
473 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
474 return true;
475 return false;
478 /* Function vect_analyze_scalar_cycles_1.
480 Examine the cross iteration def-use cycles of scalar variables
481 in LOOP. LOOP_VINFO represents the loop that is now being
482 considered for vectorization (can be LOOP, or an outer-loop
483 enclosing LOOP). */
485 static void
486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
488 basic_block bb = loop->header;
489 tree init, step;
490 auto_vec<stmt_vec_info, 64> worklist;
491 gphi_iterator gsi;
492 bool double_reduc;
494 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
496 /* First - identify all inductions. Reduction detection assumes that all the
497 inductions have been identified; therefore, this order must not be
498 changed. */
499 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
501 gphi *phi = gsi.phi ();
502 tree access_fn = NULL;
503 tree def = PHI_RESULT (phi);
504 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
509 /* Skip virtual phi's. The data dependences that are associated with
510 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
511 if (virtual_operand_p (def))
512 continue;
514 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
516 /* Analyze the evolution function. */
517 access_fn = analyze_scalar_evolution (loop, def);
518 if (access_fn)
520 STRIP_NOPS (access_fn);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "Access function of PHI: %T\n", access_fn);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
525 = initial_condition_in_loop_num (access_fn, loop->num);
526 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
527 = evolution_part_in_loop_num (access_fn, loop->num);
530 if (!access_fn
531 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
532 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
533 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
534 && TREE_CODE (step) != INTEGER_CST))
536 worklist.safe_push (stmt_vinfo);
537 continue;
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
541 != NULL_TREE);
542 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
544 if (dump_enabled_p ())
545 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
546 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
550 /* Second - identify all reductions and nested cycles. */
551 while (worklist.length () > 0)
553 stmt_vec_info stmt_vinfo = worklist.pop ();
554 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
555 tree def = PHI_RESULT (phi);
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
563 stmt_vec_info reduc_stmt_info
564 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
565 &double_reduc, false);
566 if (reduc_stmt_info)
568 if (double_reduc)
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location,
572 "Detected double reduction.\n");
574 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
575 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
576 = vect_double_reduction_def;
578 else
580 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location,
584 "Detected vectorizable nested cycle.\n");
586 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
587 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
589 else
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Detected reduction.\n");
595 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
597 /* Store the reduction cycles for possible vectorization in
598 loop-aware SLP if it was not detected as a reduction
599 chain. */
600 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
601 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
602 (reduc_stmt_info);
606 else
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 "Unknown def-use cycle pattern.\n");
614 /* Function vect_analyze_scalar_cycles.
616 Examine the cross iteration def-use cycles of scalar variables, by
617 analyzing the loop-header PHIs of scalar variables. Classify each
618 cycle as one of the following: invariant, induction, reduction, unknown.
619 We do that for the loop represented by LOOP_VINFO, and also for its
620 inner-loop, if it exists.
621 Examples for scalar cycles:
623 Example1: reduction:
625 loop1:
626 for (i=0; i<N; i++)
627 sum += a[i];
629 Example2: induction:
631 loop2:
632 for (i=0; i<N; i++)
633 a[i] = i; */
635 static void
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
638 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
640 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
642 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
643 Reductions in such inner-loop therefore have different properties than
644 the reductions in the nest that gets vectorized:
645 1. When vectorized, they are executed in the same order as in the original
646 scalar loop, so we can't change the order of computation when
647 vectorizing them.
648 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649 current checks are too strict. */
651 if (loop->inner)
652 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
655 /* Transfer group and reduction information from STMT_INFO to its
656 pattern stmt. */
658 static void
659 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
661 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
662 stmt_vec_info stmtp;
663 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
664 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
665 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
668 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
669 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
670 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
671 if (stmt_info)
672 REDUC_GROUP_NEXT_ELEMENT (stmtp)
673 = STMT_VINFO_RELATED_STMT (stmt_info);
675 while (stmt_info);
676 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
679 /* Fixup scalar cycles that now have their stmts detected as patterns. */
681 static void
682 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
684 stmt_vec_info first;
685 unsigned i;
687 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
688 if (STMT_VINFO_IN_PATTERN_P (first))
690 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
691 while (next)
693 if (! STMT_VINFO_IN_PATTERN_P (next))
694 break;
695 next = REDUC_GROUP_NEXT_ELEMENT (next);
697 /* If not all stmts in the chain are patterns, try to handle
698 the chain without patterns. */
699 if (! next)
701 vect_fixup_reduc_chain (first);
702 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
703 = STMT_VINFO_RELATED_STMT (first);
708 /* Function vect_get_loop_niters.
710 Determine how many iterations the loop is executed and place it
711 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
712 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
713 niter information holds in ASSUMPTIONS.
715 Return the loop exit condition. */
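/* For example (assuming the loop body runs at least once): for
for (i = 0; i < n; i++), the latch runs n - 1 times, so
NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS, the number of
header executions, is n. */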
718 static gcond *
719 vect_get_loop_niters (class loop *loop, tree *assumptions,
720 tree *number_of_iterations, tree *number_of_iterationsm1)
722 edge exit = single_exit (loop);
723 class tree_niter_desc niter_desc;
724 tree niter_assumptions, niter, may_be_zero;
725 gcond *cond = get_loop_exit_condition (loop);
727 *assumptions = boolean_true_node;
728 *number_of_iterationsm1 = chrec_dont_know;
729 *number_of_iterations = chrec_dont_know;
730 DUMP_VECT_SCOPE ("get_loop_niters");
732 if (!exit)
733 return cond;
735 may_be_zero = NULL_TREE;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions; this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const class loop *const loop = (const class loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 simd_if_cond (NULL_TREE),
823 unaligned_dr (NULL),
824 peeling_for_alignment (0),
825 ptr_mask (0),
826 ivexpr_map (NULL),
827 scan_map (NULL),
828 slp_unrolling_factor (1),
829 single_scalar_iteration_cost (0),
830 vectorizable (false),
831 can_fully_mask_p (true),
832 fully_masked_p (false),
833 peeling_for_gaps (false),
834 peeling_for_niter (false),
835 no_data_dependencies (false),
836 has_mask_store (false),
837 scalar_loop_scaling (profile_probability::uninitialized ()),
838 scalar_loop (NULL),
839 orig_loop_info (NULL)
841 /* CHECKME: We want to visit all BBs before their successors (except for
842 latch blocks, for which this assertion wouldn't hold). In the simple
843 case of the loop forms we allow, a dfs order of the BBs would be the same
844 as reversed postorder traversal, so we are safe. */
846 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
847 bbs, loop->num_nodes, loop);
848 gcc_assert (nbbs == loop->num_nodes);
850 for (unsigned int i = 0; i < nbbs; i++)
852 basic_block bb = bbs[i];
853 gimple_stmt_iterator si;
855 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
857 gimple *phi = gsi_stmt (si);
858 gimple_set_uid (phi, 0);
859 add_stmt (phi);
862 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
864 gimple *stmt = gsi_stmt (si);
865 gimple_set_uid (stmt, 0);
866 add_stmt (stmt);
867 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
868 third argument is the #pragma omp simd if (x) condition: when it is 0,
869 the loop shouldn't be vectorized; when it is a non-zero constant, it
870 should be vectorized normally; otherwise the loop is versioned, and the
871 vectorized copy is used if the condition is non-zero at runtime. */
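/* For reference, a sketch of the source form that produces such a call:
#pragma omp simd if (x)
for (int i = 0; i < n; i++) ...
where the if (x) clause becomes the third argument checked below. */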
872 if (loop_in->simduid
873 && is_gimple_call (stmt)
874 && gimple_call_internal_p (stmt)
875 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
876 && gimple_call_num_args (stmt) >= 3
877 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
878 && (loop_in->simduid
879 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
881 tree arg = gimple_call_arg (stmt, 2);
882 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
883 simd_if_cond = arg;
884 else
885 gcc_assert (integer_nonzerop (arg));
891 /* Free all levels of MASKS. */
893 void
894 release_vec_loop_masks (vec_loop_masks *masks)
896 rgroup_masks *rgm;
897 unsigned int i;
898 FOR_EACH_VEC_ELT (*masks, i, rgm)
899 rgm->masks.release ();
900 masks->release ();
903 /* Free all memory used by the _loop_vec_info, as well as all the
904 stmt_vec_info structs of all the stmts in the loop. */
906 _loop_vec_info::~_loop_vec_info ()
908 free (bbs);
910 release_vec_loop_masks (&masks);
911 delete ivexpr_map;
912 delete scan_map;
914 loop->aux = NULL;
917 /* Return an invariant or register for EXPR and emit necessary
918 computations in the LOOP_VINFO loop preheader. */
920 tree
921 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
923 if (is_gimple_reg (expr)
924 || is_gimple_min_invariant (expr))
925 return expr;
927 if (! loop_vinfo->ivexpr_map)
928 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
929 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
930 if (! cached)
932 gimple_seq stmts = NULL;
933 cached = force_gimple_operand (unshare_expr (expr),
934 &stmts, true, NULL_TREE);
935 if (stmts)
937 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
938 gsi_insert_seq_on_edge_immediate (e, stmts);
941 return cached;
944 /* Return true if we can use CMP_TYPE as the comparison type to produce
945 all masks required to mask LOOP_VINFO. */
947 static bool
948 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
950 rgroup_masks *rgm;
951 unsigned int i;
952 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
953 if (rgm->mask_type != NULL_TREE
954 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
955 cmp_type, rgm->mask_type,
956 OPTIMIZE_FOR_SPEED))
957 return false;
958 return true;
961 /* Calculate the maximum number of scalars per iteration for every
962 rgroup in LOOP_VINFO. */
964 static unsigned int
965 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
967 unsigned int res = 1;
968 unsigned int i;
969 rgroup_masks *rgm;
970 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
971 res = MAX (res, rgm->max_nscalars_per_iter);
972 return res;
975 /* Each statement in LOOP_VINFO can be masked where necessary. Check
976 whether we can actually generate the masks required. Return true if so,
977 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
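/* Roughly (a sketch, not the exact sequence generated), each rgroup mask
is produced by an IFN_WHILE_ULT of the loop IV against the iteration
limit, with lane L of the result set iff IV + L < limit, which is why the
comparison type below must be wide enough to hold the limit scaled by the
maximum number of scalars per iteration. */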
979 static bool
980 vect_verify_full_masking (loop_vec_info loop_vinfo)
982 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
983 unsigned int min_ni_width;
984 unsigned int max_nscalars_per_iter
985 = vect_get_max_nscalars_per_iter (loop_vinfo);
987 /* Use a normal loop if there are no statements that need masking.
988 This only happens in rare degenerate cases: it means that the loop
989 has no loads, no stores, and no live-out values. */
990 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
991 return false;
993 /* Get the maximum number of iterations that is representable
994 in the counter type. */
995 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
996 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
998 /* Get a more refined estimate for the number of iterations. */
999 widest_int max_back_edges;
1000 if (max_loop_iterations (loop, &max_back_edges))
1001 max_ni = wi::smin (max_ni, max_back_edges + 1);
1003 /* Account for rgroup masks, in which each bit is replicated N times. */
1004 max_ni *= max_nscalars_per_iter;
1006 /* Work out how many bits we need to represent the limit. */
1007 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1009 /* Find a scalar mode for which WHILE_ULT is supported. */
1010 opt_scalar_int_mode cmp_mode_iter;
1011 tree cmp_type = NULL_TREE;
1012 tree iv_type = NULL_TREE;
1013 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1014 unsigned int iv_precision = UINT_MAX;
1016 if (iv_limit != -1)
1017 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1018 UNSIGNED);
1020 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1022 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1023 if (cmp_bits >= min_ni_width
1024 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1026 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1027 if (this_type
1028 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1030 /* Although we could stop as soon as we find a valid mode,
1031 there are at least two reasons why that's not always the
1032 best choice:
1034 - An IV that's Pmode or wider is more likely to be reusable
1035 in address calculations than an IV that's narrower than
1036 Pmode.
1038 - Doing the comparison in IV_PRECISION or wider allows
1039 a natural 0-based IV, whereas using a narrower comparison
1040 type requires mitigations against wrap-around.
1042 Conversely, if the IV limit is variable, doing the comparison
1043 in a wider type than the original type can introduce
1044 unnecessary extensions, so picking the widest valid mode
1045 is not always a good choice either.
1047 Here we prefer the first IV type that's Pmode or wider,
1048 and the first comparison type that's IV_PRECISION or wider.
1049 (The comparison type must be no wider than the IV type,
1050 to avoid extensions in the vector loop.)
1052 ??? We might want to try continuing beyond Pmode for ILP32
1053 targets if CMP_BITS < IV_PRECISION. */
1054 iv_type = this_type;
1055 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1056 cmp_type = this_type;
1057 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1058 break;
1063 if (!cmp_type)
1064 return false;
1066 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1067 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1068 return true;
1071 /* Calculate the cost of one scalar iteration of the loop. */
1072 static void
1073 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1075 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1076 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1077 int nbbs = loop->num_nodes, factor;
1078 int innerloop_iters, i;
1080 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1082 /* Gather costs for statements in the scalar loop. */
1084 /* FORNOW. */
1085 innerloop_iters = 1;
1086 if (loop->inner)
1087 innerloop_iters = 50; /* FIXME */
1089 for (i = 0; i < nbbs; i++)
1091 gimple_stmt_iterator si;
1092 basic_block bb = bbs[i];
1094 if (bb->loop_father == loop->inner)
1095 factor = innerloop_iters;
1096 else
1097 factor = 1;
1099 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1101 gimple *stmt = gsi_stmt (si);
1102 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1104 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1105 continue;
1107 /* Skip stmts that are not vectorized inside the loop. */
1108 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1109 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1110 && (!STMT_VINFO_LIVE_P (vstmt_info)
1111 || !VECTORIZABLE_CYCLE_DEF
1112 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1113 continue;
1115 vect_cost_for_stmt kind;
1116 if (STMT_VINFO_DATA_REF (stmt_info))
1118 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1119 kind = scalar_load;
1120 else
1121 kind = scalar_store;
1123 else
1124 kind = scalar_stmt;
1126 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1127 factor, kind, stmt_info, 0, vect_prologue);
1131 /* Now accumulate cost. */
1132 void *target_cost_data = init_cost (loop);
1133 stmt_info_for_cost *si;
1134 int j;
1135 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1136 j, si)
1137 (void) add_stmt_cost (target_cost_data, si->count,
1138 si->kind, si->stmt_info, si->misalign,
1139 vect_body);
1140 unsigned dummy, body_cost = 0;
1141 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1142 destroy_cost_data (target_cost_data);
1143 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1147 /* Function vect_analyze_loop_form_1.
1149 Verify that certain CFG restrictions hold, including:
1150 - the loop has a pre-header
1151 - the loop has a single entry and exit
1152 - the loop exit condition is simple enough
1153 - the number of iterations can be analyzed, i.e., a countable loop. The
1154 niter could be analyzed under some assumptions. */
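/* For example (a sketch), a loop such as
for (i = 0; i < n; i++) a[i] = b[i];
is countable - its niter can be computed, possibly under assumptions -
whereas a pointer-chasing loop like
while (p) p = p->next;
is not and fails the last restriction above. */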
1156 opt_result
1157 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1158 tree *assumptions, tree *number_of_iterationsm1,
1159 tree *number_of_iterations, gcond **inner_loop_cond)
1161 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1163 /* Different restrictions apply when we are considering an inner-most loop,
1164 vs. an outer (nested) loop.
1165 (FORNOW. May want to relax some of these restrictions in the future). */
1167 if (!loop->inner)
1169 /* Inner-most loop. We currently require that the number of BBs is
1170 exactly 2 (the header and latch). Vectorizable inner-most loops
1171 look like this:
1173 (pre-header)
1175 header <--------+
1176 | | |
1177 | +--> latch --+
1179 (exit-bb) */
1181 if (loop->num_nodes != 2)
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized:"
1184 " control flow in loop.\n");
1186 if (empty_block_p (loop->header))
1187 return opt_result::failure_at (vect_location,
1188 "not vectorized: empty loop.\n");
1190 else
1192 class loop *innerloop = loop->inner;
1193 edge entryedge;
1195 /* Nested loop. We currently require that the loop is doubly-nested,
1196 contains a single inner loop, and the number of BBs is exactly 5.
1197 Vectorizable outer-loops look like this:
1199 (pre-header)
1201 header <---+
1203 inner-loop |
1205 tail ------+
1207 (exit-bb)
1209 The inner-loop has the properties expected of inner-most loops
1210 as described above. */
1212 if ((loop->inner)->inner || (loop->inner)->next)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " multiple nested loops.\n");
1217 if (loop->num_nodes != 5)
1218 return opt_result::failure_at (vect_location,
1219 "not vectorized:"
1220 " control flow in loop.\n");
1222 entryedge = loop_preheader_edge (innerloop);
1223 if (entryedge->src != loop->header
1224 || !single_exit (innerloop)
1225 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1226 return opt_result::failure_at (vect_location,
1227 "not vectorized:"
1228 " unsupported outerloop form.\n");
1230 /* Analyze the inner-loop. */
1231 tree inner_niterm1, inner_niter, inner_assumptions;
1232 opt_result res
1233 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1234 &inner_assumptions, &inner_niterm1,
1235 &inner_niter, NULL);
1236 if (!res)
1238 if (dump_enabled_p ())
1239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1240 "not vectorized: Bad inner loop.\n");
1241 return res;
1244 /* Don't support analyzing niter under assumptions for inner
1245 loop. */
1246 if (!integer_onep (inner_assumptions))
1247 return opt_result::failure_at (vect_location,
1248 "not vectorized: Bad inner loop.\n");
1250 if (!expr_invariant_in_loop_p (loop, inner_niter))
1251 return opt_result::failure_at (vect_location,
1252 "not vectorized: inner-loop count not"
1253 " invariant.\n");
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_NOTE, vect_location,
1257 "Considering outer-loop vectorization.\n");
1260 if (!single_exit (loop))
1261 return opt_result::failure_at (vect_location,
1262 "not vectorized: multiple exits.\n");
1263 if (EDGE_COUNT (loop->header->preds) != 2)
1264 return opt_result::failure_at (vect_location,
1265 "not vectorized:"
1266 " too many incoming edges.\n");
1268 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1269 that the loop is represented as a do-while (with a proper if-guard
1270 before the loop if needed), where the loop header contains all the
1271 executable statements, and the latch is empty. */
1272 if (!empty_block_p (loop->latch)
1273 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1274 return opt_result::failure_at (vect_location,
1275 "not vectorized: latch block not empty.\n");
1277 /* Make sure the exit is not abnormal. */
1278 edge e = single_exit (loop);
1279 if (e->flags & EDGE_ABNORMAL)
1280 return opt_result::failure_at (vect_location,
1281 "not vectorized:"
1282 " abnormal loop exit edge.\n");
1284 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1285 number_of_iterationsm1);
1286 if (!*loop_cond)
1287 return opt_result::failure_at
1288 (vect_location,
1289 "not vectorized: complicated exit condition.\n");
1291 if (integer_zerop (*assumptions)
1292 || !*number_of_iterations
1293 || chrec_contains_undetermined (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations cannot be computed.\n");
1298 if (integer_zerop (*number_of_iterations))
1299 return opt_result::failure_at
1300 (*loop_cond,
1301 "not vectorized: number of iterations = 0.\n");
1303 return opt_result::success ();
1306 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1308 opt_loop_vec_info
1309 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1311 tree assumptions, number_of_iterations, number_of_iterationsm1;
1312 gcond *loop_cond, *inner_loop_cond = NULL;
1314 opt_result res
1315 = vect_analyze_loop_form_1 (loop, &loop_cond,
1316 &assumptions, &number_of_iterationsm1,
1317 &number_of_iterations, &inner_loop_cond);
1318 if (!res)
1319 return opt_loop_vec_info::propagate_failure (res);
1321 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1322 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1323 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1324 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1325 if (!integer_onep (assumptions))
1327 /* We consider vectorizing this loop by versioning it under
1328 some assumptions. In order to do this, we need to clear
1329 existing information computed by scev and niter analyzer. */
1330 scev_reset_htab ();
1331 free_numbers_of_iterations_estimates (loop);
1332 /* Also set flag for this loop so that following scev and niter
1333 analysis are done under the assumptions. */
1334 loop_constraint_set (loop, LOOP_C_FINITE);
1335 /* Also record the assumptions for versioning. */
1336 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1339 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1341 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_NOTE, vect_location,
1344 "Symbolic number of iterations is ");
1345 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1346 dump_printf (MSG_NOTE, "\n");
1350 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1351 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1352 if (inner_loop_cond)
1354 stmt_vec_info inner_loop_cond_info
1355 = loop_vinfo->lookup_stmt (inner_loop_cond);
1356 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1359 gcc_assert (!loop->aux);
1360 loop->aux = loop_vinfo;
1361 return opt_loop_vec_info::success (loop_vinfo);
1366 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1367 statements, update the vectorization factor. */
1369 static void
1370 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1372 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1373 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1374 int nbbs = loop->num_nodes;
1375 poly_uint64 vectorization_factor;
1376 int i;
1378 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1380 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1381 gcc_assert (known_ne (vectorization_factor, 0U));
1383 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1384 vectorization factor of the loop is the unrolling factor required by
1385 the SLP instances. If that unrolling factor is 1, we say that we
1386 perform pure SLP on the loop: cross-iteration parallelism is not
1387 exploited. */
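/* For example (rough numbers): an SLP instance built from a group of 2
scalars whose vector type holds 4 elements has an unrolling factor of
4/2 = 2, so if only SLP stmts are present the vectorization factor below
becomes that factor. */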
1388 bool only_slp_in_loop = true;
1389 for (i = 0; i < nbbs; i++)
1391 basic_block bb = bbs[i];
1392 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1393 gsi_next (&si))
1395 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1396 stmt_info = vect_stmt_to_vectorize (stmt_info);
1397 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399 && !PURE_SLP_STMT (stmt_info))
1400 /* STMT needs both SLP and loop-based vectorization. */
1401 only_slp_in_loop = false;
1405 if (only_slp_in_loop)
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Loop contains only SLP stmts\n");
1410 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1412 else
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Loop contains SLP and non-SLP stmts\n");
1417 /* Both the vectorization factor and unroll factor have the form
1418 current_vector_size * X for some rational X, so they must have
1419 a common multiple. */
1420 vectorization_factor
1421 = force_common_multiple (vectorization_factor,
1422 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1425 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1426 if (dump_enabled_p ())
1428 dump_printf_loc (MSG_NOTE, vect_location,
1429 "Updating vectorization factor to ");
1430 dump_dec (MSG_NOTE, vectorization_factor);
1431 dump_printf (MSG_NOTE, ".\n");
1435 /* Return true if STMT_INFO describes a double reduction phi and if
1436 the other phi in the reduction is also relevant for vectorization.
1437 This rejects cases such as:
1439 outer1:
1440 x_1 = PHI <x_3(outer2), ...>;
1443 inner:
1444 x_2 = ...;
1447 outer2:
1448 x_3 = PHI <x_2(inner)>;
1450 if nothing in x_2 or elsewhere makes x_1 relevant. */
1452 static bool
1453 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1455 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1456 return false;
1458 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1461 /* Function vect_analyze_loop_operations.
1463 Scan the loop stmts and make sure they are all vectorizable. */
1465 static opt_result
1466 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1468 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1469 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1470 int nbbs = loop->num_nodes;
1471 int i;
1472 stmt_vec_info stmt_info;
1473 bool need_to_vectorize = false;
1474 bool ok;
1476 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1478 auto_vec<stmt_info_for_cost> cost_vec;
1480 for (i = 0; i < nbbs; i++)
1482 basic_block bb = bbs[i];
1484 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1485 gsi_next (&si))
1487 gphi *phi = si.phi ();
1488 ok = true;
1490 stmt_info = loop_vinfo->lookup_stmt (phi);
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1493 if (virtual_operand_p (gimple_phi_result (phi)))
1494 continue;
1496 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1497 (i.e., a phi in the tail of the outer-loop). */
1498 if (! is_loop_header_bb_p (bb))
1500 /* FORNOW: we currently don't support the case that these phis
1501 are not used in the outerloop (unless it is double reduction,
1502 i.e., this phi is vect_reduction_def), because this case
1503 requires actually doing something here. */
1504 if (STMT_VINFO_LIVE_P (stmt_info)
1505 && !vect_active_double_reduction_p (stmt_info))
1506 return opt_result::failure_at (phi,
1507 "Unsupported loop-closed phi"
1508 " in outer-loop.\n");
1510 /* If PHI is used in the outer loop, we check that its operand
1511 is defined in the inner loop. */
1512 if (STMT_VINFO_RELEVANT_P (stmt_info))
1514 tree phi_op;
1516 if (gimple_phi_num_args (phi) != 1)
1517 return opt_result::failure_at (phi, "unsupported phi");
1519 phi_op = PHI_ARG_DEF (phi, 0);
1520 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1521 if (!op_def_info)
1522 return opt_result::failure_at (phi, "unsupported phi\n");
1524 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1525 && (STMT_VINFO_RELEVANT (op_def_info)
1526 != vect_used_in_outer_by_reduction))
1527 return opt_result::failure_at (phi, "unsupported phi\n");
1529 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1530 || (STMT_VINFO_DEF_TYPE (stmt_info)
1531 == vect_double_reduction_def))
1532 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1533 return opt_result::failure_at (phi, "unsupported phi\n");
1536 continue;
1539 gcc_assert (stmt_info);
1541 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1542 || STMT_VINFO_LIVE_P (stmt_info))
1543 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1544 /* A scalar-dependence cycle that we don't support. */
1545 return opt_result::failure_at (phi,
1546 "not vectorized:"
1547 " scalar dependence cycle.\n");
1549 if (STMT_VINFO_RELEVANT_P (stmt_info))
1551 need_to_vectorize = true;
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1553 && ! PURE_SLP_STMT (stmt_info))
1554 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1555 &cost_vec);
1556 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1557 || (STMT_VINFO_DEF_TYPE (stmt_info)
1558 == vect_double_reduction_def)
1559 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1560 && ! PURE_SLP_STMT (stmt_info))
1561 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1562 &cost_vec);
1565 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1566 if (ok
1567 && STMT_VINFO_LIVE_P (stmt_info)
1568 && !PURE_SLP_STMT (stmt_info))
1569 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1570 -1, NULL, &cost_vec);
1572 if (!ok)
1573 return opt_result::failure_at (phi,
1574 "not vectorized: relevant phi not "
1575 "supported: %G",
1576 static_cast <gimple *> (phi));
1579 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1580 gsi_next (&si))
1582 gimple *stmt = gsi_stmt (si);
1583 if (!gimple_clobber_p (stmt))
1585 opt_result res
1586 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1587 &need_to_vectorize,
1588 NULL, NULL, &cost_vec);
1589 if (!res)
1590 return res;
1593 } /* bbs */
1595 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1597 /* All operations in the loop are either irrelevant (deal with loop
1598 control, or dead), or only used outside the loop and can be moved
1599 out of the loop (e.g. invariants, inductions). The loop can be
1600 optimized away by scalar optimizations. We're better off not
1601 touching this loop. */
1602 if (!need_to_vectorize)
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_NOTE, vect_location,
1606 "All the computation can be taken out of the loop.\n");
1607 return opt_result::failure_at
1608 (vect_location,
1609 "not vectorized: redundant loop. no profit to vectorize.\n");
1612 return opt_result::success ();
1615 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1616 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1617 definitely no, or -1 if it's worth retrying. */
1619 static int
1620 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1622 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1623 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1625 /* Only fully-masked loops can have iteration counts less than the
1626 vectorization factor. */
1627 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1629 HOST_WIDE_INT max_niter;
1631 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1632 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1633 else
1634 max_niter = max_stmt_executions_int (loop);
1636 if (max_niter != -1
1637 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1639 if (dump_enabled_p ())
1640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1641 "not vectorized: iteration count smaller than "
1642 "vectorization factor.\n");
1643 return 0;
1647 int min_profitable_iters, min_profitable_estimate;
1648 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1649 &min_profitable_estimate);
1651 if (min_profitable_iters < 0)
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "not vectorized: vectorization not profitable.\n");
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1658 "not vectorized: vector version will never be "
1659 "profitable.\n");
1660 return -1;
1663 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1664 * assumed_vf);
1666 /* Use the cost model only if it is more conservative than the user-specified
1667 threshold. */
1668 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1669 min_profitable_iters);
1671 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1673 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1674 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1676 if (dump_enabled_p ())
1677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1678 "not vectorized: vectorization not profitable.\n");
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "not vectorized: iteration count smaller than user "
1682 "specified loop bound parameter or minimum profitable "
1683 "iterations (whichever is more conservative).\n");
1684 return 0;
1687 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1688 if (estimated_niter == -1)
1689 estimated_niter = likely_max_stmt_executions_int (loop);
1690 if (estimated_niter != -1
1691 && ((unsigned HOST_WIDE_INT) estimated_niter
1692 < MAX (th, (unsigned) min_profitable_estimate)))
1694 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1696 "not vectorized: estimated iteration count too "
1697 "small.\n");
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_NOTE, vect_location,
1700 "not vectorized: estimated iteration count smaller "
1701 "than specified loop bound parameter or minimum "
1702 "profitable iterations (whichever is more "
1703 "conservative).\n");
1704 return -1;
1707 return 1;
1710 static opt_result
1711 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1712 vec<data_reference_p> *datarefs,
1713 unsigned int *n_stmts)
1715 *n_stmts = 0;
1716 for (unsigned i = 0; i < loop->num_nodes; i++)
1717 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1718 !gsi_end_p (gsi); gsi_next (&gsi))
1720 gimple *stmt = gsi_stmt (gsi);
1721 if (is_gimple_debug (stmt))
1722 continue;
1723 ++(*n_stmts);
1724 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1725 if (!res)
1727 if (is_gimple_call (stmt) && loop->safelen)
1729 tree fndecl = gimple_call_fndecl (stmt), op;
1730 if (fndecl != NULL_TREE)
1732 cgraph_node *node = cgraph_node::get (fndecl);
1733 if (node != NULL && node->simd_clones != NULL)
1735 unsigned int j, n = gimple_call_num_args (stmt);
1736 for (j = 0; j < n; j++)
1738 op = gimple_call_arg (stmt, j);
1739 if (DECL_P (op)
1740 || (REFERENCE_CLASS_P (op)
1741 && get_base_address (op)))
1742 break;
1744 op = gimple_call_lhs (stmt);
1745 /* Ignore #pragma omp declare simd functions
1746 if they don't have data references in the
1747 call stmt itself. */
1748 if (j == n
1749 && !(op
1750 && (DECL_P (op)
1751 || (REFERENCE_CLASS_P (op)
1752 && get_base_address (op)))))
1753 continue;
1757 return res;
1759 /* If dependence analysis would give up due to the limit on the
1760 number of datarefs, stop here and fail fatally. */
1761 if (datarefs->length ()
1762 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1763 return opt_result::failure_at (stmt, "exceeded param "
1764 "loop-max-datarefs-for-datadeps\n");
1766 return opt_result::success ();
1769 /* Look for SLP-only access groups and turn each individual access into its own
1770 group. */
1771 static void
1772 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1774 unsigned int i;
1775 struct data_reference *dr;
1777 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1779 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1780 FOR_EACH_VEC_ELT (datarefs, i, dr)
1782 gcc_assert (DR_REF (dr));
1783 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1785 /* Check if the load is a part of an interleaving chain. */
1786 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1788 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1789 unsigned int group_size = DR_GROUP_SIZE (first_element);
1791 /* Check if SLP-only groups. */
1792 if (!STMT_SLP_TYPE (stmt_info)
1793 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1795 /* Dissolve the group. */
1796 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1798 stmt_vec_info vinfo = first_element;
1799 while (vinfo)
1801 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1802 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1803 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1804 DR_GROUP_SIZE (vinfo) = 1;
1805 DR_GROUP_GAP (vinfo) = group_size - 1;
1806 vinfo = next;
1813 /* Function vect_analyze_loop_2.
1815 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1816 for it. The different analyses will record information in the
1817 loop_vec_info struct. */
1818 static opt_result
1819 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1821 opt_result ok = opt_result::success ();
1822 int res;
1823 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1824 poly_uint64 min_vf = 2;
1826 /* The first group of checks is independent of the vector size. */
1827 fatal = true;
1829 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1830 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: simd if(0)\n");
1834 /* Find all data references in the loop (which correspond to vdefs/vuses)
1835 and analyze their evolution in the loop. */
1837 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1839 /* Gather the data references and count stmts in the loop. */
1840 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1842 opt_result res
1843 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1844 &LOOP_VINFO_DATAREFS (loop_vinfo),
1845 n_stmts);
1846 if (!res)
1848 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1850 "not vectorized: loop contains function "
1851 "calls or data references that cannot "
1852 "be analyzed\n");
1853 return res;
1855 loop_vinfo->shared->save_datarefs ();
1857 else
1858 loop_vinfo->shared->check_datarefs ();
1860 /* Analyze the data references and also adjust the minimal
1861 vectorization factor according to the loads and stores. */
1863 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1864 if (!ok)
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 "bad data references.\n");
1869 return ok;
1872 /* Classify all cross-iteration scalar data-flow cycles.
1873 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1874 vect_analyze_scalar_cycles (loop_vinfo);
1876 vect_pattern_recog (loop_vinfo);
1878 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1880 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1881 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1883 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "bad data access.\n");
1889 return ok;
1892 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1894 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1895 if (!ok)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "unexpected pattern.\n");
1900 return ok;
1903 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not treated as fatal. */
1904 fatal = false;
1906 /* Analyze data dependences between the data-refs in the loop
1907 and adjust the maximum vectorization factor according to
1908 the dependences.
1909 FORNOW: fail at the first data dependence that we encounter. */
1911 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1912 if (!ok)
1914 if (dump_enabled_p ())
1915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1916 "bad data dependence.\n");
1917 return ok;
1919 if (max_vf != MAX_VECTORIZATION_FACTOR
1920 && maybe_lt (max_vf, min_vf))
1921 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1922 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1924 ok = vect_determine_vectorization_factor (loop_vinfo);
1925 if (!ok)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "can't determine vectorization factor.\n");
1930 return ok;
1932 if (max_vf != MAX_VECTORIZATION_FACTOR
1933 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1934 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1936 /* Compute the scalar iteration cost. */
1937 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1939 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1940 unsigned th;
1942 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1943 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1944 if (!ok)
1945 return ok;
1947 /* If there are any SLP instances mark them as pure_slp. */
1948 bool slp = vect_make_slp_decision (loop_vinfo);
1949 if (slp)
1951 /* Find stmts that need to be both vectorized and SLPed. */
1952 vect_detect_hybrid_slp (loop_vinfo);
1954 /* Update the vectorization factor based on the SLP decision. */
1955 vect_update_vf_for_slp (loop_vinfo);
1958 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1960 /* We don't expect to have to roll back to anything other than an empty
1961 set of rgroups. */
1962 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1964 /* This is the point where we can re-start analysis with SLP forced off. */
1965 start_over:
1967 /* Now the vectorization factor is final. */
1968 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1969 gcc_assert (known_ne (vectorization_factor, 0U));
1971 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1973 dump_printf_loc (MSG_NOTE, vect_location,
1974 "vectorization_factor = ");
1975 dump_dec (MSG_NOTE, vectorization_factor);
1976 dump_printf (MSG_NOTE, ", niters = %wd\n",
1977 LOOP_VINFO_INT_NITERS (loop_vinfo));
1980 HOST_WIDE_INT max_niter
1981 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1983 /* Analyze the alignment of the data-refs in the loop.
1984 Fail if a data reference is found that cannot be vectorized. */
1986 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1987 if (!ok)
1989 if (dump_enabled_p ())
1990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1991 "bad data alignment.\n");
1992 return ok;
1995 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1996 It is important to call pruning after vect_analyze_data_ref_accesses,
1997 since we use grouping information gathered by interleaving analysis. */
1998 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1999 if (!ok)
2000 return ok;
2002 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2003 vectorization, since we do not want to add extra peeling or
2004 add versioning for alignment. */
2005 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2006 /* This pass will decide on using loop versioning and/or loop peeling in
2007 order to enhance the alignment of data references in the loop. */
2008 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2009 else
2010 ok = vect_verify_datarefs_alignment (loop_vinfo);
2011 if (!ok)
2012 return ok;
2014 if (slp)
2016 /* Analyze operations in the SLP instances. Note this may
2017 remove unsupported SLP instances which makes the above
2018 SLP kind detection invalid. */
2019 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2020 vect_slp_analyze_operations (loop_vinfo);
2021 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2023 ok = opt_result::failure_at (vect_location,
2024 "unsupported SLP instances\n");
2025 goto again;
2029 /* Dissolve SLP-only groups. */
2030 vect_dissolve_slp_only_groups (loop_vinfo);
2032 /* Scan all the remaining operations in the loop that are not subject
2033 to SLP and make sure they are vectorizable. */
2034 ok = vect_analyze_loop_operations (loop_vinfo);
2035 if (!ok)
2037 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2039 "bad operation or unsupported loop bound.\n");
2040 return ok;
2043 /* Decide whether to use a fully-masked loop for this vectorization
2044 factor. */
2045 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2046 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2047 && vect_verify_full_masking (loop_vinfo));
2048 if (dump_enabled_p ())
2050 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "using a fully-masked loop.\n");
2053 else
2054 dump_printf_loc (MSG_NOTE, vect_location,
2055 "not using a fully-masked loop.\n");
2058 /* If epilog loop is required because of data accesses with gaps,
2059 one additional iteration needs to be peeled. Check if there are
2060 enough iterations for vectorization. */
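  /* Worked example (illustrative numbers only): with VF = 4 and a known
     trip count of 4, scalar_niters (= niters - 1) is 3 and 3 < 4, so the
     check below rejects the loop: peeling for gaps always reserves one
     scalar iteration for the epilogue, which leaves fewer than VF
     iterations for the vector body.  A trip count of 5 or more passes.  */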
2061 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2062 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2063 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2065 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2066 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2068 if (known_lt (wi::to_widest (scalar_niters), vf))
2069 return opt_result::failure_at (vect_location,
2070 "loop has no enough iterations to"
2071 " support peeling for gaps.\n");
2074 /* Check the costings of the loop make vectorizing worthwhile. */
2075 res = vect_analyze_loop_costing (loop_vinfo);
2076 if (res < 0)
2078 ok = opt_result::failure_at (vect_location,
2079 "Loop costings may not be worthwhile.\n");
2080 goto again;
2082 if (!res)
2083 return opt_result::failure_at (vect_location,
2084 "Loop costings not worthwhile.\n");
2086 /* Decide whether we need to create an epilogue loop to handle
2087 remaining scalar iterations. */
2088 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2090 unsigned HOST_WIDE_INT const_vf;
2091 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2092 /* The main loop handles all iterations. */
2093 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2094 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2095 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2097 /* Work out the (constant) number of iterations that need to be
2098 peeled for reasons other than niters. */
2099 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 peel_niter += 1;
2102 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2103 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2104 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2106 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2107 /* ??? When peeling for gaps but not alignment, we could
2108 try to check whether the (variable) niters is known to be
2109 VF * N + 1. That's something of a niche case though. */
2110 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2111 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2112 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2113 < (unsigned) exact_log2 (const_vf))
2114 /* In case of versioning, check if the maximum number of
2115 iterations is greater than th. If they are identical,
2116 the epilogue is unnecessary. */
2117 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2118 || ((unsigned HOST_WIDE_INT) max_niter
2119 > (th / const_vf) * const_vf))))
2120 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2122 /* If an epilogue loop is required make sure we can create one. */
2123 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2124 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2128 if (!vect_can_advance_ivs_p (loop_vinfo)
2129 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2130 single_exit (LOOP_VINFO_LOOP
2131 (loop_vinfo))))
2133 ok = opt_result::failure_at (vect_location,
2134 "not vectorized: can't create required "
2135 "epilog loop\n");
2136 goto again;
2140 /* During peeling, we need to check if number of loop iterations is
2141 enough for both peeled prolog loop and vector loop. This check
2142 can be merged along with threshold check of loop versioning, so
2143 increase threshold for this case if necessary. */
2144 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2146 poly_uint64 niters_th = 0;
2148 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2150 /* Niters for peeled prolog loop. */
2151 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2153 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2154 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2155 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2157 else
2158 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2161 /* Niters for at least one iteration of vectorized loop. */
2162 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2163 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2164 /* One additional iteration because of peeling for gap. */
2165 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2166 niters_th += 1;
2167 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
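      /* Illustrative example (made-up values): with VF = 4, a 4-element
	 vector type, peeling for alignment unknown (< 0), no fully-masked
	 loop and no peeling for gaps, niters_th = (4 - 1) + 4 = 7, so the
	 runtime versioning check sends runs with fewer iterations than
	 this to the scalar copy of the loop.  */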
2170 gcc_assert (known_eq (vectorization_factor,
2171 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2173 /* Ok to vectorize! */
2174 return opt_result::success ();
2176 again:
2177 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2178 gcc_assert (!ok);
2180 /* Try again with SLP forced off but if we didn't do any SLP there is
2181 no point in re-trying. */
2182 if (!slp)
2183 return ok;
2185 /* If there are reduction chains re-trying will fail anyway. */
2186 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2187 return ok;
2189 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2190 via interleaving or lane instructions. */
2191 slp_instance instance;
2192 slp_tree node;
2193 unsigned i, j;
2194 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2196 stmt_vec_info vinfo;
2197 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2198 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2199 continue;
2200 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2201 unsigned int size = DR_GROUP_SIZE (vinfo);
2202 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2203 if (! vect_store_lanes_supported (vectype, size, false)
2204 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2205 && ! vect_grouped_store_supported (vectype, size))
2206 return opt_result::failure_at (vinfo->stmt,
2207 "unsupported grouped store\n");
2208 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2210 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2211 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2212 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2213 size = DR_GROUP_SIZE (vinfo);
2214 vectype = STMT_VINFO_VECTYPE (vinfo);
2215 if (! vect_load_lanes_supported (vectype, size, false)
2216 && ! vect_grouped_load_supported (vectype, single_element_p,
2217 size))
2218 return opt_result::failure_at (vinfo->stmt,
2219 "unsupported grouped load\n");
2223 if (dump_enabled_p ())
2224 dump_printf_loc (MSG_NOTE, vect_location,
2225 "re-trying with SLP disabled\n");
2227 /* Roll back state appropriately. No SLP this time. */
2228 slp = false;
2229 /* Restore the vectorization factor as it was without SLP. */
2230 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2231 /* Free the SLP instances. */
2232 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2233 vect_free_slp_instance (instance, false);
2234 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2235 /* Reset SLP type to loop_vect on all stmts. */
2236 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2238 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2239 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2240 !gsi_end_p (si); gsi_next (&si))
2242 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2243 STMT_SLP_TYPE (stmt_info) = loop_vect;
2245 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2246 !gsi_end_p (si); gsi_next (&si))
2248 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2249 STMT_SLP_TYPE (stmt_info) = loop_vect;
2250 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2252 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2253 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2254 STMT_SLP_TYPE (stmt_info) = loop_vect;
2255 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2256 !gsi_end_p (pi); gsi_next (&pi))
2257 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2258 = loop_vect;
2262 /* Free optimized alias test DDRS. */
2263 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2264 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2265 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2266 /* Reset target cost data. */
2267 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2268 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2269 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2270 /* Reset accumulated rgroup information. */
2271 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2272 /* Reset assorted flags. */
2273 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2274 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2275 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2276 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2277 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2279 goto start_over;
2282 /* Function vect_analyze_loop.
2284 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2285 for it. The different analyses will record information in the
2286 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2287 be vectorized. */
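/* For illustration (hypothetical, target-dependent sizes): on a target
   whose autovectorize_vector_sizes hook returns vector sizes of 64, 32
   and 16 bytes, the analysis below first runs with current_vector_size
   = 0 (autodetected); if that attempt fails, or loop->simdlen asks for
   a different vectorization factor, it is repeated with each remaining
   size until one succeeds or the list is exhausted.  */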
2288 opt_loop_vec_info
2289 vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
2290 vec_info_shared *shared)
2292 auto_vector_sizes vector_sizes;
2294 /* Autodetect first vector size we try. */
2295 current_vector_size = 0;
2296 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2297 loop->simdlen != 0);
2298 unsigned int next_size = 0;
2300 DUMP_VECT_SCOPE ("analyze_loop_nest");
2302 if (loop_outer (loop)
2303 && loop_vec_info_for_loop (loop_outer (loop))
2304 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2305 return opt_loop_vec_info::failure_at (vect_location,
2306 "outer-loop already vectorized.\n");
2308 if (!find_loop_nest (loop, &shared->loop_nest))
2309 return opt_loop_vec_info::failure_at
2310 (vect_location,
2311 "not vectorized: loop nest containing two or more consecutive inner"
2312 " loops cannot be vectorized\n");
2314 unsigned n_stmts = 0;
2315 poly_uint64 autodetected_vector_size = 0;
2316 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2317 poly_uint64 first_vector_size = 0;
2318 while (1)
2320 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2321 opt_loop_vec_info loop_vinfo
2322 = vect_analyze_loop_form (loop, shared);
2323 if (!loop_vinfo)
2325 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2327 "bad loop form.\n");
2328 gcc_checking_assert (first_loop_vinfo == NULL);
2329 return loop_vinfo;
2332 bool fatal = false;
2334 if (orig_loop_vinfo)
2335 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2337 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2338 if (res)
2340 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2342 if (loop->simdlen
2343 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2344 (unsigned HOST_WIDE_INT) loop->simdlen))
2346 if (first_loop_vinfo == NULL)
2348 first_loop_vinfo = loop_vinfo;
2349 first_vector_size = current_vector_size;
2350 loop->aux = NULL;
2352 else
2353 delete loop_vinfo;
2355 else
2357 delete first_loop_vinfo;
2358 return loop_vinfo;
2361 else
2362 delete loop_vinfo;
2364 if (next_size == 0)
2365 autodetected_vector_size = current_vector_size;
2367 if (next_size < vector_sizes.length ()
2368 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2369 next_size += 1;
2371 if (fatal)
2373 gcc_checking_assert (first_loop_vinfo == NULL);
2374 return opt_loop_vec_info::propagate_failure (res);
2377 if (next_size == vector_sizes.length ()
2378 || known_eq (current_vector_size, 0U))
2380 if (first_loop_vinfo)
2382 current_vector_size = first_vector_size;
2383 loop->aux = (loop_vec_info) first_loop_vinfo;
2384 if (dump_enabled_p ())
2386 dump_printf_loc (MSG_NOTE, vect_location,
2387 "***** Choosing vector size ");
2388 dump_dec (MSG_NOTE, current_vector_size);
2389 dump_printf (MSG_NOTE, "\n");
2391 return first_loop_vinfo;
2393 else
2394 return opt_loop_vec_info::propagate_failure (res);
2397 /* Try the next biggest vector size. */
2398 current_vector_size = vector_sizes[next_size++];
2399 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "***** Re-trying analysis with "
2403 "vector size ");
2404 dump_dec (MSG_NOTE, current_vector_size);
2405 dump_printf (MSG_NOTE, "\n");
2410 /* Return true if there is an in-order reduction function for CODE, storing
2411 it in *REDUC_FN if so. */
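/* For example (illustration only), a float accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += x[i];

   compiled without -fassociative-math (e.g. without -ffast-math) must
   preserve the original left-to-right evaluation order, so it can only
   be vectorized via IFN_FOLD_LEFT_PLUS rather than a tree-style
   IFN_REDUC_PLUS epilogue.  */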
2413 static bool
2414 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2416 switch (code)
2418 case PLUS_EXPR:
2419 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2420 return true;
2422 default:
2423 return false;
2427 /* Function reduction_fn_for_scalar_code
2429 Input:
2430 CODE - tree_code of a reduction operation.
2432 Output:
2433 REDUC_FN - the corresponding internal function to be used to reduce the
2434 vector of partial results into a single scalar result, or IFN_LAST
2435 if the operation is a supported reduction operation, but does not have
2436 such an internal function.
2438 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2440 static bool
2441 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2443 switch (code)
2445 case MAX_EXPR:
2446 *reduc_fn = IFN_REDUC_MAX;
2447 return true;
2449 case MIN_EXPR:
2450 *reduc_fn = IFN_REDUC_MIN;
2451 return true;
2453 case PLUS_EXPR:
2454 *reduc_fn = IFN_REDUC_PLUS;
2455 return true;
2457 case BIT_AND_EXPR:
2458 *reduc_fn = IFN_REDUC_AND;
2459 return true;
2461 case BIT_IOR_EXPR:
2462 *reduc_fn = IFN_REDUC_IOR;
2463 return true;
2465 case BIT_XOR_EXPR:
2466 *reduc_fn = IFN_REDUC_XOR;
2467 return true;
2469 case MULT_EXPR:
2470 case MINUS_EXPR:
2471 *reduc_fn = IFN_LAST;
2472 return true;
2474 default:
2475 return false;
2479 /* If there is a neutral value X such that SLP reduction NODE would not
2480 be affected by the introduction of additional X elements, return that X,
2481 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2482 is true if the SLP statements perform a single reduction, false if each
2483 statement performs an independent reduction. */
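/* Illustrative summary of the cases handled below: for PLUS_EXPR the
   neutral value is 0 and for MULT_EXPR it is 1, so an SLP group of three
   partial sums can be padded to a four-lane vector with an extra 0
   without changing any result; for BIT_AND_EXPR the neutral value is
   all-ones, and MIN/MAX only have a neutral value (the shared initial
   value) for reduction chains.  */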
2485 static tree
2486 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2487 bool reduc_chain)
2489 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2490 stmt_vec_info stmt_vinfo = stmts[0];
2491 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2492 tree scalar_type = TREE_TYPE (vector_type);
2493 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2494 gcc_assert (loop);
2496 switch (code)
2498 case WIDEN_SUM_EXPR:
2499 case DOT_PROD_EXPR:
2500 case SAD_EXPR:
2501 case PLUS_EXPR:
2502 case MINUS_EXPR:
2503 case BIT_IOR_EXPR:
2504 case BIT_XOR_EXPR:
2505 return build_zero_cst (scalar_type);
2507 case MULT_EXPR:
2508 return build_one_cst (scalar_type);
2510 case BIT_AND_EXPR:
2511 return build_all_ones_cst (scalar_type);
2513 case MAX_EXPR:
2514 case MIN_EXPR:
2515 /* For MIN/MAX the initial values are neutral. A reduction chain
2516 has only a single initial value, so that value is neutral for
2517 all statements. */
2518 if (reduc_chain)
2519 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2520 loop_preheader_edge (loop));
2521 return NULL_TREE;
2523 default:
2524 return NULL_TREE;
2528 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2529 STMT is printed with a message MSG. */
2531 static void
2532 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2534 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2537 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2538 operation. Return true if the results of DEF_STMT_INFO are something
2539 that can be accumulated by such a reduction. */
2541 static bool
2542 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2544 return (is_gimple_assign (def_stmt_info->stmt)
2545 || is_gimple_call (def_stmt_info->stmt)
2546 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2547 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2548 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2549 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2552 /* Return true if we need an in-order reduction for operation CODE
2553 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2554 overflow must wrap. */
2556 static bool
2557 needs_fold_left_reduction_p (tree type, tree_code code,
2558 bool need_wrapping_integral_overflow)
2560 /* CHECKME: check for !flag_finite_math_only too? */
2561 if (SCALAR_FLOAT_TYPE_P (type))
2562 switch (code)
2564 case MIN_EXPR:
2565 case MAX_EXPR:
2566 return false;
2568 default:
2569 return !flag_associative_math;
2572 if (INTEGRAL_TYPE_P (type))
2574 if (!operation_no_trapping_overflow (type, code))
2575 return true;
2576 if (need_wrapping_integral_overflow
2577 && !TYPE_OVERFLOW_WRAPS (type)
2578 && operation_can_overflow (code))
2579 return true;
2580 return false;
2583 if (SAT_FIXED_POINT_TYPE_P (type))
2584 return true;
2586 return false;
2589 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2590 reduction operation CODE has a handled computation expression. */
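/* As an illustrative (simplified) example, for a loop body that
   accumulates twice,

     sum_1 = PHI <sum_0, sum_3>
     sum_2 = sum_1 + a[i];
     sum_3 = sum_2 + b[i];

   the path walked from the latch value sum_3 back to the PHI result is
   sum_3 -> sum_2 -> sum_1, and every link is a single-use operand of a
   PLUS_EXPR, so the path is accepted.  */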
2592 static bool
2593 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2594 tree loop_arg, enum tree_code code,
2595 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2597 auto_bitmap visited;
2598 tree lookfor = PHI_RESULT (phi);
2599 ssa_op_iter curri;
2600 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2601 while (USE_FROM_PTR (curr) != loop_arg)
2602 curr = op_iter_next_use (&curri);
2603 curri.i = curri.numops;
2606 path.safe_push (std::make_pair (curri, curr));
2607 tree use = USE_FROM_PTR (curr);
2608 if (use == lookfor)
2609 break;
2610 gimple *def = SSA_NAME_DEF_STMT (use);
2611 if (gimple_nop_p (def)
2612 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2614 pop:
2617 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2618 curri = x.first;
2619 curr = x.second;
2621 curr = op_iter_next_use (&curri);
2622 /* Skip already visited or non-SSA operands (from iterating
2623 over PHI args). */
2624 while (curr != NULL_USE_OPERAND_P
2625 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2626 || ! bitmap_set_bit (visited,
2627 SSA_NAME_VERSION
2628 (USE_FROM_PTR (curr)))));
2630 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2631 if (curr == NULL_USE_OPERAND_P)
2632 break;
2634 else
2636 if (gimple_code (def) == GIMPLE_PHI)
2637 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2638 else
2639 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2640 while (curr != NULL_USE_OPERAND_P
2641 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2642 || ! bitmap_set_bit (visited,
2643 SSA_NAME_VERSION
2644 (USE_FROM_PTR (curr)))))
2645 curr = op_iter_next_use (&curri);
2646 if (curr == NULL_USE_OPERAND_P)
2647 goto pop;
2650 while (1);
2651 if (dump_file && (dump_flags & TDF_DETAILS))
2653 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2654 unsigned i;
2655 std::pair<ssa_op_iter, use_operand_p> *x;
2656 FOR_EACH_VEC_ELT (path, i, x)
2657 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2658 dump_printf (MSG_NOTE, "\n");
2661 /* Check whether the reduction path detected is valid. */
2662 bool fail = path.length () == 0;
2663 bool neg = false;
2664 for (unsigned i = 1; i < path.length (); ++i)
2666 gimple *use_stmt = USE_STMT (path[i].second);
2667 tree op = USE_FROM_PTR (path[i].second);
2668 if (! has_single_use (op)
2669 || ! is_gimple_assign (use_stmt)
2670 /* The following makes sure we can compute the operand index
2671 easily, plus it mostly disallows chaining via COND_EXPR condition
2672 operands. */
2673 || (gimple_assign_rhs1 (use_stmt) != op
2674 && gimple_assign_rhs2 (use_stmt) != op
2675 && gimple_assign_rhs3 (use_stmt) != op))
2677 fail = true;
2678 break;
2680 if (gimple_assign_rhs_code (use_stmt) != code)
2682 if (code == PLUS_EXPR
2683 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2685 /* Track whether we negate the reduction value each iteration. */
2686 if (gimple_assign_rhs2 (use_stmt) == op)
2687 neg = ! neg;
2689 else
2691 fail = true;
2692 break;
2696 return ! fail && ! neg;
2699 bool
2700 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2701 tree loop_arg, enum tree_code code)
2703 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2704 return check_reduction_path (loc, loop, phi, loop_arg, code, path);
2709 /* Function vect_is_simple_reduction
2711 (1) Detect a cross-iteration def-use cycle that represents a simple
2712 reduction computation. We look for the following pattern:
2714 loop_header:
2715 a1 = phi < a0, a2 >
2716 a3 = ...
2717 a2 = operation (a3, a1)
2721 a3 = ...
2722 loop_header:
2723 a1 = phi < a0, a2 >
2724 a2 = operation (a3, a1)
2726 such that:
2727 1. operation is commutative and associative and it is safe to
2728 change the order of the computation
2729 2. no uses for a2 in the loop (a2 is used out of the loop)
2730 3. no uses of a1 in the loop besides the reduction operation
2731 4. no uses of a1 outside the loop.
2733 Conditions 1,4 are tested here.
2734 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2736 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2737 nested cycles.
2739 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2740 reductions:
2742 a1 = phi < a0, a2 >
2743 inner loop (def of a3)
2744 a2 = phi < a3 >
2746 (4) Detect condition expressions, i.e.:
2747 for (int i = 0; i < N; i++)
2748 if (a[i] < val)
2749 ret_val = a[i];
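     For instance (illustrative C source), pattern (1) matches the scalar
     reduction

       int sum = 0;
       for (i = 0; i < n; i++)
         sum += a[i];

     where the loop header carries  sum_1 = phi <0, sum_2>  and the loop
     body computes  sum_2 = a[i] + sum_1.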
2753 static stmt_vec_info
2754 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2755 bool *double_reduc,
2756 bool need_wrapping_integral_overflow,
2757 enum vect_reduction_type *v_reduc_type)
2759 gphi *phi = as_a <gphi *> (phi_info->stmt);
2760 class loop *loop = (gimple_bb (phi))->loop_father;
2761 class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2762 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2763 gimple *phi_use_stmt = NULL;
2764 enum tree_code orig_code, code;
2765 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2766 tree type;
2767 tree name;
2768 imm_use_iterator imm_iter;
2769 use_operand_p use_p;
2770 bool phi_def;
2772 *double_reduc = false;
2773 *v_reduc_type = TREE_CODE_REDUCTION;
2775 tree phi_name = PHI_RESULT (phi);
2776 /* ??? If there are no uses of the PHI result the inner loop reduction
2777 won't be detected as possibly double-reduction by vectorizable_reduction
2778 because that tries to walk the PHI arg from the preheader edge which
2779 can be constant. See PR60382. */
2780 if (has_zero_uses (phi_name))
2781 return NULL;
2782 unsigned nphi_def_loop_uses = 0;
2783 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2785 gimple *use_stmt = USE_STMT (use_p);
2786 if (is_gimple_debug (use_stmt))
2787 continue;
2789 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2791 if (dump_enabled_p ())
2792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2793 "intermediate value used outside loop.\n");
2795 return NULL;
2798 nphi_def_loop_uses++;
2799 phi_use_stmt = use_stmt;
2802 edge latch_e = loop_latch_edge (loop);
2803 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2804 if (TREE_CODE (loop_arg) != SSA_NAME)
2806 if (dump_enabled_p ())
2807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2808 "reduction: not ssa_name: %T\n", loop_arg);
2809 return NULL;
2812 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2813 if (!def_stmt_info
2814 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2815 return NULL;
2817 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2819 name = gimple_assign_lhs (def_stmt);
2820 phi_def = false;
2822 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2824 name = PHI_RESULT (def_stmt);
2825 phi_def = true;
2827 else
2829 if (dump_enabled_p ())
2830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2831 "reduction: unhandled reduction operation: %G",
2832 def_stmt_info->stmt);
2833 return NULL;
2836 unsigned nlatch_def_loop_uses = 0;
2837 auto_vec<gphi *, 3> lcphis;
2838 bool inner_loop_of_double_reduc = false;
2839 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2841 gimple *use_stmt = USE_STMT (use_p);
2842 if (is_gimple_debug (use_stmt))
2843 continue;
2844 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2845 nlatch_def_loop_uses++;
2846 else
2848 /* We can have more than one loop-closed PHI. */
2849 lcphis.safe_push (as_a <gphi *> (use_stmt));
2850 if (nested_in_vect_loop
2851 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2852 == vect_double_reduction_def))
2853 inner_loop_of_double_reduc = true;
2857 /* If this isn't a nested cycle or if the nested cycle reduction value
2858 is used outside of the inner loop, we cannot handle uses of the reduction
2859 value. */
2860 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2861 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2865 "reduction used in loop.\n");
2866 return NULL;
2869 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2870 defined in the inner loop. */
2871 if (phi_def)
2873 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2874 op1 = PHI_ARG_DEF (def_stmt, 0);
2876 if (gimple_phi_num_args (def_stmt) != 1
2877 || TREE_CODE (op1) != SSA_NAME)
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881 "unsupported phi node definition.\n");
2883 return NULL;
2886 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2887 if (gimple_bb (def1)
2888 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2889 && loop->inner
2890 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2891 && is_gimple_assign (def1)
2892 && is_a <gphi *> (phi_use_stmt)
2893 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2895 if (dump_enabled_p ())
2896 report_vect_op (MSG_NOTE, def_stmt,
2897 "detected double reduction: ");
2899 *double_reduc = true;
2900 return def_stmt_info;
2903 return NULL;
2906 /* If we are vectorizing an inner reduction, we execute it in the
2907 original order only when we are not dealing with a double
2908 reduction. */
2909 bool check_reduction = true;
2910 if (flow_loop_nested_p (vect_loop, loop))
2912 gphi *lcphi;
2913 unsigned i;
2914 check_reduction = false;
2915 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2916 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2918 gimple *use_stmt = USE_STMT (use_p);
2919 if (is_gimple_debug (use_stmt))
2920 continue;
2921 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2922 check_reduction = true;
2926 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2927 code = orig_code = gimple_assign_rhs_code (def_stmt);
2929 if (nested_in_vect_loop && !check_reduction)
2931 /* FIXME: Even for non-reductions code generation is funneled
2932 through vectorizable_reduction for the stmt defining the
2933 PHI latch value. So we have to artificially restrict ourselves
2934 for the supported operations. */
2935 switch (get_gimple_rhs_class (code))
2937 case GIMPLE_BINARY_RHS:
2938 case GIMPLE_TERNARY_RHS:
2939 break;
2940 default:
2941 /* Not supported by vectorizable_reduction. */
2942 if (dump_enabled_p ())
2943 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2944 "nested cycle: not handled operation: ");
2945 return NULL;
2947 if (dump_enabled_p ())
2948 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2949 return def_stmt_info;
2952 /* We can handle "res -= x[i]", which is non-associative by
2953 simply rewriting this into "res += -x[i]". Avoid changing
2954 gimple instruction for the first simple tests and only do this
2955 if we're allowed to change code at all. */
2956 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2957 code = PLUS_EXPR;
2959 if (code == COND_EXPR)
2961 if (! nested_in_vect_loop)
2962 *v_reduc_type = COND_REDUCTION;
2964 op3 = gimple_assign_rhs1 (def_stmt);
2965 if (COMPARISON_CLASS_P (op3))
2967 op4 = TREE_OPERAND (op3, 1);
2968 op3 = TREE_OPERAND (op3, 0);
2970 if (op3 == phi_name || op4 == phi_name)
2972 if (dump_enabled_p ())
2973 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2974 "reduction: condition depends on previous"
2975 " iteration: ");
2976 return NULL;
2979 op1 = gimple_assign_rhs2 (def_stmt);
2980 op2 = gimple_assign_rhs3 (def_stmt);
2982 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2984 if (dump_enabled_p ())
2985 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2986 "reduction: not commutative/associative: ");
2987 return NULL;
2989 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2991 op1 = gimple_assign_rhs1 (def_stmt);
2992 op2 = gimple_assign_rhs2 (def_stmt);
2994 else
2996 if (dump_enabled_p ())
2997 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2998 "reduction: not handled operation: ");
2999 return NULL;
3002 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3004 if (dump_enabled_p ())
3005 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3006 "reduction: both uses not ssa_names: ");
3008 return NULL;
3011 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3012 if ((TREE_CODE (op1) == SSA_NAME
3013 && !types_compatible_p (type,TREE_TYPE (op1)))
3014 || (TREE_CODE (op2) == SSA_NAME
3015 && !types_compatible_p (type, TREE_TYPE (op2)))
3016 || (op3 && TREE_CODE (op3) == SSA_NAME
3017 && !types_compatible_p (type, TREE_TYPE (op3)))
3018 || (op4 && TREE_CODE (op4) == SSA_NAME
3019 && !types_compatible_p (type, TREE_TYPE (op4))))
3021 if (dump_enabled_p ())
3023 dump_printf_loc (MSG_NOTE, vect_location,
3024 "reduction: multiple types: operation type: "
3025 "%T, operands types: %T,%T",
3026 type, TREE_TYPE (op1), TREE_TYPE (op2));
3027 if (op3)
3028 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3030 if (op4)
3031 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3032 dump_printf (MSG_NOTE, "\n");
3035 return NULL;
3038 /* Check whether it's ok to change the order of the computation.
3039 Generally, when vectorizing a reduction we change the order of the
3040 computation. This may change the behavior of the program in some
3041 cases, so we need to check that this is ok. One exception is when
3042 vectorizing an outer-loop: the inner-loop is executed sequentially,
3043 and therefore vectorizing reductions in the inner-loop during
3044 outer-loop vectorization is safe. */
3045 if (check_reduction
3046 && *v_reduc_type == TREE_CODE_REDUCTION
3047 && needs_fold_left_reduction_p (type, code,
3048 need_wrapping_integral_overflow))
3049 *v_reduc_type = FOLD_LEFT_REDUCTION;
3051 /* Reduction is safe. We're dealing with one of the following:
3052 1) integer arithmetic and no trapv
3053 2) floating point arithmetic, and special flags permit this optimization
3054 3) nested cycle (i.e., outer loop vectorization). */
3055 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3056 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3057 if (code != COND_EXPR && !def1_info && !def2_info)
3059 if (dump_enabled_p ())
3060 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3061 return NULL;
3064 /* Check that one def is the reduction def, defined by PHI,
3065 the other def is either defined in the loop ("vect_internal_def"),
3066 or it's an induction (defined by a loop-header phi-node). */
3068 if (def2_info
3069 && def2_info->stmt == phi
3070 && (code == COND_EXPR
3071 || !def1_info
3072 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3073 || vect_valid_reduction_input_p (def1_info)))
3075 STMT_VINFO_REDUC_IDX (def_stmt_info) = 1 + (code == COND_EXPR ? 1 : 0);
3076 if (dump_enabled_p ())
3077 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3078 return def_stmt_info;
3081 if (def1_info
3082 && def1_info->stmt == phi
3083 && (code == COND_EXPR
3084 || !def2_info
3085 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3086 || vect_valid_reduction_input_p (def2_info)))
3088 STMT_VINFO_REDUC_IDX (def_stmt_info) = 0 + (code == COND_EXPR ? 1 : 0);
3089 if (dump_enabled_p ())
3090 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3091 return def_stmt_info;
3094 /* Look for the expression computing loop_arg from loop PHI result. */
3095 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3096 if (check_reduction_path (vect_location, loop, phi, loop_arg, code,
3097 path))
3099 /* Try building an SLP reduction chain for which the additional
3100 restriction is that all operations in the chain are the same. */
3101 auto_vec<stmt_vec_info, 8> reduc_chain;
3102 unsigned i;
3103 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3104 for (i = path.length () - 1; i >= 1; --i)
3106 gimple *stmt = USE_STMT (path[i].second);
3107 if (gimple_assign_rhs_code (stmt) != code)
3108 is_slp_reduc = false;
3109 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3110 STMT_VINFO_REDUC_IDX (stmt_info)
3111 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3112 reduc_chain.safe_push (stmt_info);
3114 if (is_slp_reduc)
3116 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3118 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3119 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3121 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3122 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3124 /* Save the chain for further analysis in SLP detection. */
3125 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3126 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3128 if (dump_enabled_p ())
3129 report_vect_op (MSG_NOTE, def_stmt,
3130 "reduction: detected reduction chain: ");
3133 return def_stmt_info;
3136 if (dump_enabled_p ())
3138 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3139 "reduction: unknown pattern: ");
3142 return NULL;
3145 /* Wrapper around vect_is_simple_reduction, which will modify code
3146 in-place if it enables detection of more reductions. Arguments
3147 as there. */
3149 static stmt_vec_info
3150 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3151 bool *double_reduc,
3152 bool need_wrapping_integral_overflow)
3154 enum vect_reduction_type v_reduc_type;
3155 stmt_vec_info def_info
3156 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3157 need_wrapping_integral_overflow,
3158 &v_reduc_type);
3159 if (def_info)
3161 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3162 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3163 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3164 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3166 return def_info;
3169 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
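/* Illustrative arithmetic (made-up values): with a known trip count of
   100, an assumed VF of 4 and PEEL_ITERS_PROLOGUE = 3, the epilogue gets
   (100 - 3) % 4 = 1 peeled iteration, and the scalar cost vector is
   accumulated 3 times into the prologue costs and once into the
   epilogue costs.  */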
3170 int
3171 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3172 int *peel_iters_epilogue,
3173 stmt_vector_for_cost *scalar_cost_vec,
3174 stmt_vector_for_cost *prologue_cost_vec,
3175 stmt_vector_for_cost *epilogue_cost_vec)
3177 int retval = 0;
3178 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3180 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3182 *peel_iters_epilogue = assumed_vf / 2;
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "cost model: epilogue peel iters set to vf/2 "
3186 "because loop iterations are unknown .\n");
3188 /* If peeled iterations are known but number of scalar loop
3189 iterations are unknown, count a taken branch per peeled loop. */
3190 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3191 NULL, 0, vect_prologue);
3192 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3193 NULL, 0, vect_epilogue);
3195 else
3197 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3198 peel_iters_prologue = niters < peel_iters_prologue ?
3199 niters : peel_iters_prologue;
3200 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3201 /* If we need to peel for gaps but the epilogue peel count would
3202 otherwise be zero, we have to peel VF iterations. */
3203 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3204 *peel_iters_epilogue = assumed_vf;
3207 stmt_info_for_cost *si;
3208 int j;
3209 if (peel_iters_prologue)
3210 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3211 retval += record_stmt_cost (prologue_cost_vec,
3212 si->count * peel_iters_prologue,
3213 si->kind, si->stmt_info, si->misalign,
3214 vect_prologue);
3215 if (*peel_iters_epilogue)
3216 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3217 retval += record_stmt_cost (epilogue_cost_vec,
3218 si->count * *peel_iters_epilogue,
3219 si->kind, si->stmt_info, si->misalign,
3220 vect_epilogue);
3222 return retval;
3225 /* Function vect_estimate_min_profitable_iters
3227 Return the number of iterations required for the vector version of the
3228 loop to be profitable relative to the cost of the scalar version of the
3229 loop.
3231 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3232 of iterations for vectorization. -1 value means loop vectorization
3233 is not profitable. This returned value may be used for dynamic
3234 profitability check.
3236 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3237 for static check against estimated number of iterations. */
3239 static void
3240 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3241 int *ret_min_profitable_niters,
3242 int *ret_min_profitable_estimate)
3244 int min_profitable_iters;
3245 int min_profitable_estimate;
3246 int peel_iters_prologue;
3247 int peel_iters_epilogue;
3248 unsigned vec_inside_cost = 0;
3249 int vec_outside_cost = 0;
3250 unsigned vec_prologue_cost = 0;
3251 unsigned vec_epilogue_cost = 0;
3252 int scalar_single_iter_cost = 0;
3253 int scalar_outside_cost = 0;
3254 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3255 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3256 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3258 /* Cost model disabled. */
3259 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3261 if (dump_enabled_p ())
3262 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3263 *ret_min_profitable_niters = 0;
3264 *ret_min_profitable_estimate = 0;
3265 return;
3268 /* Requires loop versioning tests to handle misalignment. */
3269 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3271 /* FIXME: Make cost depend on complexity of individual check. */
3272 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3273 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3274 vect_prologue);
3275 if (dump_enabled_p ())
3276 dump_printf (MSG_NOTE,
3277 "cost model: Adding cost of checks for loop "
3278 "versioning to treat misalignment.\n");
3281 /* Requires loop versioning with alias checks. */
3282 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3284 /* FIXME: Make cost depend on complexity of individual check. */
3285 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3286 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3287 vect_prologue);
3288 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3289 if (len)
3290 /* Count LEN - 1 ANDs and LEN comparisons. */
3291 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3292 NULL, 0, vect_prologue);
3293 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3294 if (len)
3296 /* Count LEN - 1 ANDs and LEN comparisons. */
3297 unsigned int nstmts = len * 2 - 1;
3298 /* +1 for each bias that needs adding. */
3299 for (unsigned int i = 0; i < len; ++i)
3300 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3301 nstmts += 1;
3302 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3303 NULL, 0, vect_prologue);
3305 if (dump_enabled_p ())
3306 dump_printf (MSG_NOTE,
3307 "cost model: Adding cost of checks for loop "
3308 "versioning aliasing.\n");
3311 /* Requires loop versioning with niter checks. */
3312 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3314 /* FIXME: Make cost depend on complexity of individual check. */
3315 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3316 vect_prologue);
3317 if (dump_enabled_p ())
3318 dump_printf (MSG_NOTE,
3319 "cost model: Adding cost of checks for loop "
3320 "versioning niters.\n");
3323 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3324 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3325 vect_prologue);
3327 /* Count statements in scalar loop. Using this as scalar cost for a single
3328 iteration for now.
3330 TODO: Add outer loop support.
3332 TODO: Consider assigning different costs to different scalar
3333 statements. */
3335 scalar_single_iter_cost
3336 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3338 /* Add additional cost for the peeled instructions in prologue and epilogue
3339 loop. (For fully-masked loops there will be no peeling.)
3341 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3342 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3344 TODO: Build an expression that represents peel_iters for prologue and
3345 epilogue to be used in a run-time test. */
3347 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3349 peel_iters_prologue = 0;
3350 peel_iters_epilogue = 0;
3352 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3354 /* We need to peel exactly one iteration. */
3355 peel_iters_epilogue += 1;
3356 stmt_info_for_cost *si;
3357 int j;
3358 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3359 j, si)
3360 (void) add_stmt_cost (target_cost_data, si->count,
3361 si->kind, si->stmt_info, si->misalign,
3362 vect_epilogue);
3365 else if (npeel < 0)
3367 peel_iters_prologue = assumed_vf / 2;
3368 if (dump_enabled_p ())
3369 dump_printf (MSG_NOTE, "cost model: "
3370 "prologue peel iters set to vf/2.\n");
3372 /* If peeling for alignment is unknown, loop bound of main loop becomes
3373 unknown. */
3374 peel_iters_epilogue = assumed_vf / 2;
3375 if (dump_enabled_p ())
3376 dump_printf (MSG_NOTE, "cost model: "
3377 "epilogue peel iters set to vf/2 because "
3378 "peeling for alignment is unknown.\n");
3380 /* If peeled iterations are unknown, count a taken branch and a not taken
3381 branch per peeled loop. Even if scalar loop iterations are known,
3382 vector iterations are not known since peeled prologue iterations are
3383 not known. Hence guards remain the same. */
3384 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3385 NULL, 0, vect_prologue);
3386 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3387 NULL, 0, vect_prologue);
3388 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3389 NULL, 0, vect_epilogue);
3390 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3391 NULL, 0, vect_epilogue);
3392 stmt_info_for_cost *si;
3393 int j;
3394 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3396 (void) add_stmt_cost (target_cost_data,
3397 si->count * peel_iters_prologue,
3398 si->kind, si->stmt_info, si->misalign,
3399 vect_prologue);
3400 (void) add_stmt_cost (target_cost_data,
3401 si->count * peel_iters_epilogue,
3402 si->kind, si->stmt_info, si->misalign,
3403 vect_epilogue);
3406 else
3408 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3409 stmt_info_for_cost *si;
3410 int j;
3411 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3413 prologue_cost_vec.create (2);
3414 epilogue_cost_vec.create (2);
3415 peel_iters_prologue = npeel;
3417 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3418 &peel_iters_epilogue,
3419 &LOOP_VINFO_SCALAR_ITERATION_COST
3420 (loop_vinfo),
3421 &prologue_cost_vec,
3422 &epilogue_cost_vec);
3424 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3425 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3426 si->misalign, vect_prologue);
3428 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3429 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3430 si->misalign, vect_epilogue);
3432 prologue_cost_vec.release ();
3433 epilogue_cost_vec.release ();
3436 /* FORNOW: The scalar outside cost is incremented in one of the
3437 following ways:
3439 1. The vectorizer checks for alignment and aliasing and generates
3440 a condition that allows dynamic vectorization. A cost model
3441 check is ANDED with the versioning condition. Hence scalar code
3442 path now has the added cost of the versioning check.
3444 if (cost > th & versioning_check)
3445 jmp to vector code
3447 Hence run-time scalar is incremented by not-taken branch cost.
3449 2. The vectorizer then checks if a prologue is required. If the
3450 cost model check was not done before during versioning, it has to
3451 be done before the prologue check.
3453 if (cost <= th)
3454 prologue = scalar_iters
3455 if (prologue == 0)
3456 jmp to vector code
3457 else
3458 execute prologue
3459 if (prologue == num_iters)
3460 go to exit
3462 Hence the run-time scalar cost is incremented by a taken branch,
3463 plus a not-taken branch, plus a taken branch cost.
3465 3. The vectorizer then checks if an epilogue is required. If the
3466 cost model check was not done before during prologue check, it
3467 has to be done with the epilogue check.
3469 if (prologue == 0)
3470 jmp to vector code
3471 else
3472 execute prologue
3473 if (prologue == num_iters)
3474 go to exit
3475 vector code:
3476 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3477 jmp to epilogue
3479 Hence the run-time scalar cost should be incremented by 2 taken
3480 branches.
3482 TODO: The back end may reorder the BBs differently and reverse
3483 conditions/branch directions. Change the estimates below to
3484 something more reasonable. */
3486 /* If the number of iterations is known and we do not do versioning, we can
3487 decide whether to vectorize at compile time. Hence the scalar version
3488 does not carry cost model guard costs. */
3489 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3490 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3492 /* Cost model check occurs at versioning. */
3493 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3494 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3495 else
3497 /* Cost model check occurs at prologue generation. */
3498 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3499 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3500 + vect_get_stmt_cost (cond_branch_not_taken);
3501 /* Cost model check occurs at epilogue generation. */
3502 else
3503 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3507 /* Complete the target-specific cost calculations. */
3508 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3509 &vec_inside_cost, &vec_epilogue_cost);
3511 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3513 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3516 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3517 vec_inside_cost);
3518 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3519 vec_prologue_cost);
3520 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3521 vec_epilogue_cost);
3522 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3523 scalar_single_iter_cost);
3524 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3525 scalar_outside_cost);
3526 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3527 vec_outside_cost);
3528 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3529 peel_iters_prologue);
3530 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3531 peel_iters_epilogue);
3534 /* Calculate number of iterations required to make the vector version
3535 profitable, relative to the loop bodies only. The following condition
3536 must hold true:
3537 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3538 where
3539 SIC = scalar iteration cost, VIC = vector iteration cost,
3540 VOC = vector outside cost, VF = vectorization factor,
3541 NPEEL = prologue iterations + epilogue iterations,
3542 SOC = scalar outside cost for run time cost model check. */
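  /* A purely illustrative instance of the inequality above: with SIC = 4,
     VIC = 6, VF = 4, VOC = 20 and SOC = NPEEL = 0 it becomes
     4 * niters > 6 * (niters / 4) + 20, i.e. 2.5 * niters > 20, so the
     vector loop starts to win once niters exceeds 8.  */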
3544 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3545 - vec_inside_cost);
3546 if (saving_per_viter <= 0)
3548 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3549 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3550 "vectorization did not happen for a simd loop");
3552 if (dump_enabled_p ())
3553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3554 "cost model: the vector iteration cost = %d "
3555 "divided by the scalar iteration cost = %d "
3556 "is greater or equal to the vectorization factor = %d"
3557 ".\n",
3558 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3559 *ret_min_profitable_niters = -1;
3560 *ret_min_profitable_estimate = -1;
3561 return;
3564 /* ??? The "if" arm is written to handle all cases; see below for what
3565 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3566 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3568 /* Rewriting the condition above in terms of the number of
3569 vector iterations (vniters) rather than the number of
3570 scalar iterations (niters) gives:
3572 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3574 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3576 For integer N, X and Y when X > 0:
3578 N * X > Y <==> N >= (Y /[floor] X) + 1. */
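/* For instance (illustrative values only): with X = saving_per_viter = 3
   and Y = outside_overhead = 7 this gives N >= 7 / 3 + 1 = 3, and indeed
   2 * 3 = 6 <= 7 while 3 * 3 = 9 > 7.  */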
3579 int outside_overhead = (vec_outside_cost
3580 - scalar_single_iter_cost * peel_iters_prologue
3581 - scalar_single_iter_cost * peel_iters_epilogue
3582 - scalar_outside_cost);
3583 /* We're only interested in cases that require at least one
3584 vector iteration. */
3585 int min_vec_niters = 1;
3586 if (outside_overhead > 0)
3587 min_vec_niters = outside_overhead / saving_per_viter + 1;
3589 if (dump_enabled_p ())
3590 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3591 min_vec_niters);
3593 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3595 /* Now that we know the minimum number of vector iterations,
3596 find the minimum niters for which the scalar cost is larger:
3598 SIC * niters > VIC * vniters + VOC - SOC
3600 We know that the minimum niters is no more than
3601 vniters * VF + NPEEL, but it might be (and often is) less
3602 than that if a partial vector iteration is cheaper than the
3603 equivalent scalar code. */
3604 int threshold = (vec_inside_cost * min_vec_niters
3605 + vec_outside_cost
3606 - scalar_outside_cost);
3607 if (threshold <= 0)
3608 min_profitable_iters = 1;
3609 else
3610 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3612 else
3613 /* Convert the number of vector iterations into a number of
3614 scalar iterations. */
3615 min_profitable_iters = (min_vec_niters * assumed_vf
3616 + peel_iters_prologue
3617 + peel_iters_epilogue);
3619 else
3621 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3622 * assumed_vf
3623 - vec_inside_cost * peel_iters_prologue
3624 - vec_inside_cost * peel_iters_epilogue);
3625 if (min_profitable_iters <= 0)
3626 min_profitable_iters = 0;
3627 else
3629 min_profitable_iters /= saving_per_viter;
3631 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3632 <= (((int) vec_inside_cost * min_profitable_iters)
3633 + (((int) vec_outside_cost - scalar_outside_cost)
3634 * assumed_vf)))
3635 min_profitable_iters++;
3639 if (dump_enabled_p ())
3640 dump_printf (MSG_NOTE,
3641 " Calculated minimum iters for profitability: %d\n",
3642 min_profitable_iters);
3644 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3645 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3646 /* We want the vectorized loop to execute at least once. */
3647 min_profitable_iters = assumed_vf + peel_iters_prologue;
3649 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_NOTE, vect_location,
3651 " Runtime profitability threshold = %d\n",
3652 min_profitable_iters);
3654 *ret_min_profitable_niters = min_profitable_iters;
3656 /* Calculate number of iterations required to make the vector version
3657 profitable, relative to the loop bodies only.
3659 The non-vectorized variant costs SIC * niters and it must win over the
3660 vector variant on the expected loop trip count. The following condition must hold true:
3661 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3663 if (vec_outside_cost <= 0)
3664 min_profitable_estimate = 0;
3665 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3667 /* This is a repeat of the code above, but with + SOC rather
3668 than - SOC. */
3669 int outside_overhead = (vec_outside_cost
3670 - scalar_single_iter_cost * peel_iters_prologue
3671 - scalar_single_iter_cost * peel_iters_epilogue
3672 + scalar_outside_cost);
3673 int min_vec_niters = 1;
3674 if (outside_overhead > 0)
3675 min_vec_niters = outside_overhead / saving_per_viter + 1;
3677 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3679 int threshold = (vec_inside_cost * min_vec_niters
3680 + vec_outside_cost
3681 + scalar_outside_cost);
3682 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3684 else
3685 min_profitable_estimate = (min_vec_niters * assumed_vf
3686 + peel_iters_prologue
3687 + peel_iters_epilogue);
3689 else
3691 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3692 * assumed_vf
3693 - vec_inside_cost * peel_iters_prologue
3694 - vec_inside_cost * peel_iters_epilogue)
3695 / ((scalar_single_iter_cost * assumed_vf)
3696 - vec_inside_cost);
3698 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3699 if (dump_enabled_p ())
3700 dump_printf_loc (MSG_NOTE, vect_location,
3701 " Static estimate profitability threshold = %d\n",
3702 min_profitable_estimate);
3704 *ret_min_profitable_estimate = min_profitable_estimate;
3707 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3708 vector elements (not bits) for a vector with NELT elements. */
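/* For example (illustrative): with OFFSET == 2 and NELT == 8 the encoded
   series is 2, 3, 4, ..., 9.  When the second vec_perm operand is a zero
   vector, as in the shift-based reduction epilogue below, this selects
   { v[2], ..., v[7], 0, 0 }, i.e. a shift down by two elements.  */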
3709 static void
3710 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3711 vec_perm_builder *sel)
3713 /* The encoding is a single stepped pattern. Any wrap-around is handled
3714 by vec_perm_indices. */
3715 sel->new_vector (nelt, 1, 3);
3716 for (unsigned int i = 0; i < 3; i++)
3717 sel->quick_push (i + offset);
3720 /* Checks whether the target supports whole-vector shifts for vectors of mode
3721 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3722 it supports vec_perm_const with masks for all necessary shift amounts. */
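/* E.g. for an 8-element vector the loop below probes shifts by 4, 2 and 1
   elements, matching the shift amounts the reduction epilogue uses.  */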
3723 static bool
3724 have_whole_vector_shift (machine_mode mode)
3726 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3727 return true;
3729 /* Variable-length vectors should be handled via the optab. */
3730 unsigned int nelt;
3731 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3732 return false;
3734 vec_perm_builder sel;
3735 vec_perm_indices indices;
3736 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3738 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3739 indices.new_vector (sel, 2, nelt);
3740 if (!can_vec_perm_const_p (mode, indices, false))
3741 return false;
3743 return true;
3746 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3747 functions. Design better to avoid maintenance issues. */
3749 /* Function vect_model_reduction_cost.
3751 Models cost for a reduction operation, including the vector ops
3752 generated within the strip-mine loop, the initial definition before
3753 the loop, and the epilogue code that must be generated. */
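/* For instance (this merely restates what the code below computes): a plain
   PLUS reduction with a direct REDUC_FN and NCOPIES == 1 is costed as one
   scalar_to_vec in the prologue, one vector_stmt in the body, and one
   vector_stmt plus one vec_to_scalar in the epilogue.  */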
3755 static void
3756 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3757 int ncopies, stmt_vector_for_cost *cost_vec)
3759 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3760 enum tree_code code;
3761 optab optab;
3762 tree vectype;
3763 machine_mode mode;
3764 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3765 class loop *loop = NULL;
3767 if (loop_vinfo)
3768 loop = LOOP_VINFO_LOOP (loop_vinfo);
3770 /* Condition reductions generate two reductions in the loop. */
3771 vect_reduction_type reduction_type
3772 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3773 if (reduction_type == COND_REDUCTION)
3774 ncopies *= 2;
3776 vectype = STMT_VINFO_VECTYPE (stmt_info);
3777 mode = TYPE_MODE (vectype);
3778 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3780 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3782 if (reduction_type == EXTRACT_LAST_REDUCTION
3783 || reduction_type == FOLD_LEFT_REDUCTION)
3785 /* No extra instructions needed in the prologue. */
3786 prologue_cost = 0;
3788 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3789 /* Count one reduction-like operation per vector. */
3790 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3791 stmt_info, 0, vect_body);
3792 else
3794 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3795 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3796 inside_cost = record_stmt_cost (cost_vec, nelements,
3797 vec_to_scalar, stmt_info, 0,
3798 vect_body);
3799 inside_cost += record_stmt_cost (cost_vec, nelements,
3800 scalar_stmt, stmt_info, 0,
3801 vect_body);
3804 else
3806 /* Add in cost for initial definition.
3807 For cond reduction we have four vectors: initial index, step,
3808 initial result of the data reduction, initial value of the index
3809 reduction. */
3810 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3811 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3812 scalar_to_vec, stmt_info, 0,
3813 vect_prologue);
3815 /* Cost of reduction op inside loop. */
3816 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3817 stmt_info, 0, vect_body);
3820 /* Determine cost of epilogue code.
3822 We have a reduction operator that will reduce the vector in one statement.
3823 Also requires scalar extract. */
3825 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3827 if (reduc_fn != IFN_LAST)
3829 if (reduction_type == COND_REDUCTION)
3831 /* An EQ stmt and a COND_EXPR stmt. */
3832 epilogue_cost += record_stmt_cost (cost_vec, 2,
3833 vector_stmt, stmt_info, 0,
3834 vect_epilogue);
3835 /* Reduction of the max index and a reduction of the found
3836 values. */
3837 epilogue_cost += record_stmt_cost (cost_vec, 2,
3838 vec_to_scalar, stmt_info, 0,
3839 vect_epilogue);
3840 /* A broadcast of the max value. */
3841 epilogue_cost += record_stmt_cost (cost_vec, 1,
3842 scalar_to_vec, stmt_info, 0,
3843 vect_epilogue);
3845 else
3847 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3848 stmt_info, 0, vect_epilogue);
3849 epilogue_cost += record_stmt_cost (cost_vec, 1,
3850 vec_to_scalar, stmt_info, 0,
3851 vect_epilogue);
3854 else if (reduction_type == COND_REDUCTION)
3856 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3857 /* Extraction of scalar elements. */
3858 epilogue_cost += record_stmt_cost (cost_vec,
3859 2 * estimated_nunits,
3860 vec_to_scalar, stmt_info, 0,
3861 vect_epilogue);
3862 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3863 epilogue_cost += record_stmt_cost (cost_vec,
3864 2 * estimated_nunits - 3,
3865 scalar_stmt, stmt_info, 0,
3866 vect_epilogue);
3868 else if (reduction_type == EXTRACT_LAST_REDUCTION
3869 || reduction_type == FOLD_LEFT_REDUCTION)
3870 /* No extra instructions needed in the epilogue. */
3872 else
3874 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3875 tree bitsize =
3876 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3877 int element_bitsize = tree_to_uhwi (bitsize);
3878 int nelements = vec_size_in_bits / element_bitsize;
3880 if (code == COND_EXPR)
3881 code = MAX_EXPR;
3883 optab = optab_for_tree_code (code, vectype, optab_default);
3885 /* We have a whole vector shift available. */
3886 if (optab != unknown_optab
3887 && VECTOR_MODE_P (mode)
3888 && optab_handler (optab, mode) != CODE_FOR_nothing
3889 && have_whole_vector_shift (mode))
3891 /* Final reduction via vector shifts and the reduction operator.
3892 Also requires scalar extract. */
3893 epilogue_cost += record_stmt_cost (cost_vec,
3894 exact_log2 (nelements) * 2,
3895 vector_stmt, stmt_info, 0,
3896 vect_epilogue);
3897 epilogue_cost += record_stmt_cost (cost_vec, 1,
3898 vec_to_scalar, stmt_info, 0,
3899 vect_epilogue);
3901 else
3902 /* Use extracts and reduction op for final reduction. For N
3903 elements, we have N extracts and N-1 reduction ops. */
3904 epilogue_cost += record_stmt_cost (cost_vec,
3905 nelements + nelements - 1,
3906 vector_stmt, stmt_info, 0,
3907 vect_epilogue);
3911 if (dump_enabled_p ())
3912 dump_printf (MSG_NOTE,
3913 "vect_model_reduction_cost: inside_cost = %d, "
3914 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3915 prologue_cost, epilogue_cost);
3919 /* Function vect_model_induction_cost.
3921 Models cost for induction operations. */
3923 static void
3924 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3925 stmt_vector_for_cost *cost_vec)
3927 unsigned inside_cost, prologue_cost;
3929 if (PURE_SLP_STMT (stmt_info))
3930 return;
3932 /* loop cost for vec_loop. */
3933 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3934 stmt_info, 0, vect_body);
3936 /* prologue cost for vec_init and vec_step. */
3937 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3938 stmt_info, 0, vect_prologue);
3940 if (dump_enabled_p ())
3941 dump_printf_loc (MSG_NOTE, vect_location,
3942 "vect_model_induction_cost: inside_cost = %d, "
3943 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3948 /* Function get_initial_def_for_reduction
3950 Input:
3951 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3952 INIT_VAL - the initial value of the reduction variable
3954 Output:
3955 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3956 of the reduction (used for adjusting the epilog - see below).
3957 Return a vector variable, initialized according to the operation that
3958 STMT_VINFO performs. This vector will be used as the initial value
3959 of the vector of partial results.
3961 Option1 (adjust in epilog): Initialize the vector as follows:
3962 add/bit or/xor: [0,0,...,0,0]
3963 mult/bit and: [1,1,...,1,1]
3964 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3965 and when necessary (e.g. add/mult case) let the caller know
3966 that it needs to adjust the result by init_val.
3968 Option2: Initialize the vector as follows:
3969 add/bit or/xor: [init_val,0,0,...,0]
3970 mult/bit and: [init_val,1,1,...,1]
3971 min/max/cond_expr: [init_val,init_val,...,init_val]
3972 and no adjustments are needed.
3974 For example, for the following code:
3976 s = init_val;
3977 for (i=0;i<n;i++)
3978 s = s + a[i];
3980 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3981 For a vector of 4 units, we want to return either [0,0,0,init_val],
3982 or [0,0,0,0] and let the caller know that it needs to adjust
3983 the result at the end by 'init_val'.
3985 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3986 is not NULL, because this way the initialization vector is simpler (same
3987 element in all entries), and Option2 otherwise.
3989 A cost model should help decide between these two schemes. */
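/* Concrete instance of the two options above (illustrative): for the sum
   example above with init_val = 5 and a vector of 4 units, Option1 returns
   [0,0,0,0] and reports an adjustment of 5 to be added to the final result,
   while Option2 returns a vector with 5 in one element and 0 in the others
   and needs no adjustment; both yield the same total.  */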
3991 static tree
3992 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
3993 enum tree_code code, tree init_val,
3994 tree *adjustment_def)
3996 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3997 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3998 tree scalar_type = TREE_TYPE (init_val);
3999 tree vectype = get_vectype_for_scalar_type (scalar_type);
4000 tree def_for_init;
4001 tree init_def;
4002 REAL_VALUE_TYPE real_init_val = dconst0;
4003 int int_init_val = 0;
4004 gimple_seq stmts = NULL;
4006 gcc_assert (vectype);
4008 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4009 || SCALAR_FLOAT_TYPE_P (scalar_type));
4011 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4012 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4014 /* ADJUSTMENT_DEF is NULL when called from
4015 vect_create_epilog_for_reduction to vectorize double reduction. */
4016 if (adjustment_def)
4017 *adjustment_def = NULL;
4019 switch (code)
4021 case WIDEN_SUM_EXPR:
4022 case DOT_PROD_EXPR:
4023 case SAD_EXPR:
4024 case PLUS_EXPR:
4025 case MINUS_EXPR:
4026 case BIT_IOR_EXPR:
4027 case BIT_XOR_EXPR:
4028 case MULT_EXPR:
4029 case BIT_AND_EXPR:
4031 if (code == MULT_EXPR)
4033 real_init_val = dconst1;
4034 int_init_val = 1;
4037 if (code == BIT_AND_EXPR)
4038 int_init_val = -1;
4040 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4041 def_for_init = build_real (scalar_type, real_init_val);
4042 else
4043 def_for_init = build_int_cst (scalar_type, int_init_val);
4045 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4047 /* Option1: the first element is '0' or '1' as well. */
4048 if (!operand_equal_p (def_for_init, init_val, 0))
4049 *adjustment_def = init_val;
4050 init_def = gimple_build_vector_from_val (&stmts, vectype,
4051 def_for_init);
4053 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4055 /* Option2 (variable length): the first element is INIT_VAL. */
4056 init_def = gimple_build_vector_from_val (&stmts, vectype,
4057 def_for_init);
4058 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4059 vectype, init_def, init_val);
4061 else
4063 /* Option2: the first element is INIT_VAL. */
4064 tree_vector_builder elts (vectype, 1, 2);
4065 elts.quick_push (init_val);
4066 elts.quick_push (def_for_init);
4067 init_def = gimple_build_vector (&stmts, &elts);
4070 break;
4072 case MIN_EXPR:
4073 case MAX_EXPR:
4074 case COND_EXPR:
4076 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4077 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4079 break;
4081 default:
4082 gcc_unreachable ();
4085 if (stmts)
4086 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4087 return init_def;
4090 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4091 NUMBER_OF_VECTORS is the number of vector defs to create.
4092 If NEUTRAL_OP is nonnull, introducing extra elements of that
4093 value will not change the result. */
4095 static void
4096 get_initial_defs_for_reduction (slp_tree slp_node,
4097 vec<tree> *vec_oprnds,
4098 unsigned int number_of_vectors,
4099 bool reduc_chain, tree neutral_op)
4101 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4102 stmt_vec_info stmt_vinfo = stmts[0];
4103 unsigned HOST_WIDE_INT nunits;
4104 unsigned j, number_of_places_left_in_vector;
4105 tree vector_type;
4106 unsigned int group_size = stmts.length ();
4107 unsigned int i;
4108 class loop *loop;
4110 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4112 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4114 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4115 gcc_assert (loop);
4116 edge pe = loop_preheader_edge (loop);
4118 gcc_assert (!reduc_chain || neutral_op);
4120 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4121 created vectors. It is greater than 1 if unrolling is performed.
4123 For example, we have two scalar operands, s1 and s2 (e.g., group of
4124 strided accesses of size two), while NUNITS is four (i.e., four scalars
4125 of this type can be packed in a vector). The output vector will contain
4126 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4127 will be 2).
4129 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4130 vectors containing the operands.
4132 For example, NUNITS is four as before, and the group size is 8
4133 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4134 {s5, s6, s7, s8}. */
4136 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4137 nunits = group_size;
4139 number_of_places_left_in_vector = nunits;
4140 bool constant_p = true;
4141 tree_vector_builder elts (vector_type, nunits, 1);
4142 elts.quick_grow (nunits);
4143 gimple_seq ctor_seq = NULL;
4144 for (j = 0; j < nunits * number_of_vectors; ++j)
4146 tree op;
4147 i = j % group_size;
4148 stmt_vinfo = stmts[i];
4150 /* Get the def before the loop. In a reduction chain we have only
4151 one initial value; otherwise we have as many initial values as PHIs in the group. */
4152 if (reduc_chain)
4153 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4154 else if (((vec_oprnds->length () + 1) * nunits
4155 - number_of_places_left_in_vector >= group_size)
4156 && neutral_op)
4157 op = neutral_op;
4158 else
4159 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4161 /* Create 'vect_ = {op0,op1,...,opn}'. */
4162 number_of_places_left_in_vector--;
4163 elts[nunits - number_of_places_left_in_vector - 1] = op;
4164 if (!CONSTANT_CLASS_P (op))
4165 constant_p = false;
4167 if (number_of_places_left_in_vector == 0)
4169 tree init;
4170 if (constant_p && !neutral_op
4171 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4172 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4173 /* Build the vector directly from ELTS. */
4174 init = gimple_build_vector (&ctor_seq, &elts);
4175 else if (neutral_op)
4177 /* Build a vector of the neutral value and shift the
4178 other elements into place. */
4179 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4180 neutral_op);
4181 int k = nunits;
4182 while (k > 0 && elts[k - 1] == neutral_op)
4183 k -= 1;
4184 while (k > 0)
4186 k -= 1;
4187 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4188 vector_type, init, elts[k]);
4191 else
4193 /* First time round, duplicate ELTS to fill the
4194 required number of vectors. */
4195 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4196 number_of_vectors, *vec_oprnds);
4197 break;
4199 vec_oprnds->quick_push (init);
4201 number_of_places_left_in_vector = nunits;
4202 elts.new_vector (vector_type, nunits, 1);
4203 elts.quick_grow (nunits);
4204 constant_p = true;
4207 if (ctor_seq != NULL)
4208 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4212 /* Function vect_create_epilog_for_reduction
4214 Create code at the loop-epilog to finalize the result of a reduction
4215 computation.
4217 STMT_INFO is the scalar reduction stmt that is being vectorized.
4218 SLP_NODE is an SLP node containing a group of reduction statements. The
4219 first one in this group is STMT_INFO.
4220 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4221 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4222 (counting from 0)
4224 This function:
4225 1. Completes the reduction def-use cycles.
4226 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4227 by calling the function specified by REDUC_FN if available, or by
4228 other means (whole-vector shifts or a scalar loop).
4229 The function also creates a new phi node at the loop exit to preserve
4230 loop-closed form, as illustrated below.
4232 The flow at the entry to this function:
4234 loop:
4235 vec_def = phi <vec_init, null> # REDUCTION_PHI
4236 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4237 s_loop = scalar_stmt # (scalar) STMT_INFO
4238 loop_exit:
4239 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4240 use <s_out0>
4241 use <s_out0>
4243 The above is transformed by this function into:
4245 loop:
4246 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4247 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4248 s_loop = scalar_stmt # (scalar) STMT_INFO
4249 loop_exit:
4250 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4251 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4252 v_out2 = reduce <v_out1>
4253 s_out3 = extract_field <v_out2, 0>
4254 s_out4 = adjust_result <s_out3>
4255 use <s_out4>
4256 use <s_out4>
4259 static void
4260 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4261 slp_tree slp_node,
4262 slp_instance slp_node_instance)
4264 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4265 /* For double reductions we need to get at the inner loop reduction
4266 stmt which has the meta info attached. Our stmt_info is that of the
4267 loop-closed PHI of the inner loop which we remember as
4268 def for the reduction PHI generation. */
4269 bool double_reduc = false;
4270 stmt_vec_info rdef_info = stmt_info;
4271 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4273 gcc_assert (!slp_node);
4274 double_reduc = true;
4275 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4276 (stmt_info->stmt, 0));
4277 stmt_info = vect_stmt_to_vectorize (stmt_info);
4279 gphi *reduc_def_stmt
4280 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4281 enum tree_code code = STMT_VINFO_REDUC_CODE (stmt_info);
4282 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (stmt_info);
4283 tree neutral_op = NULL_TREE;
4284 if (slp_node)
4285 neutral_op
4286 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
4287 REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4288 stmt_vec_info prev_phi_info;
4289 tree vectype;
4290 machine_mode mode;
4291 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4292 basic_block exit_bb;
4293 tree scalar_dest;
4294 tree scalar_type;
4295 gimple *new_phi = NULL, *phi;
4296 stmt_vec_info phi_info;
4297 gimple_stmt_iterator exit_gsi;
4298 tree vec_dest;
4299 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4300 gimple *epilog_stmt = NULL;
4301 gimple *exit_phi;
4302 tree bitsize;
4303 tree expr, def;
4304 tree orig_name, scalar_result;
4305 imm_use_iterator imm_iter, phi_imm_iter;
4306 use_operand_p use_p, phi_use_p;
4307 gimple *use_stmt;
4308 bool nested_in_vect_loop = false;
4309 auto_vec<gimple *> new_phis;
4310 int j, i;
4311 auto_vec<tree> scalar_results;
4312 unsigned int group_size = 1, k;
4313 auto_vec<gimple *> phis;
4314 bool slp_reduc = false;
4315 bool direct_slp_reduc;
4316 tree new_phi_result;
4317 tree induction_index = NULL_TREE;
4319 if (slp_node)
4320 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4322 if (nested_in_vect_loop_p (loop, stmt_info))
4324 outer_loop = loop;
4325 loop = loop->inner;
4326 nested_in_vect_loop = true;
4327 gcc_assert (!slp_node);
4329 gcc_assert (!nested_in_vect_loop || double_reduc);
4331 vectype = STMT_VINFO_VECTYPE (stmt_info);
4332 gcc_assert (vectype);
4333 mode = TYPE_MODE (vectype);
4335 tree initial_def = NULL;
4336 tree induc_val = NULL_TREE;
4337 tree adjustment_def = NULL;
4338 if (slp_node)
4340 else
4342 /* Get at the scalar def before the loop, that defines the initial value
4343 of the reduction variable. */
4344 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4345 loop_preheader_edge (loop));
4346 /* Optimize: for induction condition reduction, if we can't use zero
4347 for induc_val, use initial_def. */
4348 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4349 == INTEGER_INDUC_COND_REDUCTION)
4350 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (stmt_info);
4351 else if (double_reduc)
4353 else if (nested_in_vect_loop)
4355 else
4356 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (stmt_info);
4359 unsigned vec_num;
4360 int ncopies;
4361 if (slp_node)
4363 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4364 ncopies = 1;
4366 else
4368 vec_num = 1;
4369 ncopies = 0;
4370 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4373 ncopies++;
4374 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4376 while (phi_info);
4379 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4380 which is updated with the current index of the loop for every match of
4381 the original loop's cond_expr (VEC_STMT). This results in a vector
4382 containing the last time the condition passed for that vector lane.
4383 The first match will be a 1 to allow 0 to be used for non-matching
4384 indexes. If there are no matches at all then the vector will be all
4385 zeroes. */
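/* Rough illustration (hypothetical values): with VF == 4, if the condition
   last held in scalar iterations 2 and 9, the index vector ends up as
   { 0, 10, 3, 0 }: each lane records 1 + the last scalar iteration that
   matched in that lane, and 0 if that lane never matched.  The epilogue
   below then uses the lane with the maximum index (here 10).  */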
4386 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4388 tree indx_before_incr, indx_after_incr;
4389 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4391 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4392 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4394 int scalar_precision
4395 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4396 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4397 tree cr_index_vector_type = build_vector_type
4398 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4400 /* First we create a simple vector induction variable which starts
4401 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4402 vector size (STEP). */
4404 /* Create a {1,2,3,...} vector. */
4405 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4407 /* Create a vector of the step value. */
4408 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4409 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4411 /* Create an induction variable. */
4412 gimple_stmt_iterator incr_gsi;
4413 bool insert_after;
4414 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4415 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4416 insert_after, &indx_before_incr, &indx_after_incr);
4418 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4419 filled with zeros (VEC_ZERO). */
4421 /* Create a vector of 0s. */
4422 tree zero = build_zero_cst (cr_index_scalar_type);
4423 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4425 /* Create a vector phi node. */
4426 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4427 new_phi = create_phi_node (new_phi_tree, loop->header);
4428 loop_vinfo->add_stmt (new_phi);
4429 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4430 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4432 /* Now take the condition from the loop's original cond_expr
4433 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4434 every match uses values from the induction variable
4435 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4436 (NEW_PHI_TREE).
4437 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4438 the new cond_expr (INDEX_COND_EXPR). */
4440 /* Duplicate the condition from vec_stmt. */
4441 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4443 /* Create a conditional, where the condition is taken from vec_stmt
4444 (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR:
4445 the reduction phi corresponds to NEW_PHI_TREE and the new values
4446 correspond to INDEX_BEFORE_INCR. */
4447 gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1);
4448 tree index_cond_expr;
4449 if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
4450 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4451 ccompare, indx_before_incr, new_phi_tree);
4452 else
4453 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4454 ccompare, new_phi_tree, indx_before_incr);
4455 induction_index = make_ssa_name (cr_index_vector_type);
4456 gimple *index_condition = gimple_build_assign (induction_index,
4457 index_cond_expr);
4458 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4459 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4460 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4462 /* Update the phi with the vec cond. */
4463 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4464 loop_latch_edge (loop), UNKNOWN_LOCATION);
4467 /* 2. Create epilog code.
4468 The reduction epilog code operates across the elements of the vector
4469 of partial results computed by the vectorized loop.
4470 The reduction epilog code consists of:
4472 step 1: compute the scalar result in a vector (v_out2)
4473 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4474 step 3: adjust the scalar result (s_out3) if needed.
4476 Step 1 can be accomplished using one of the following three schemes:
4477 (scheme 1) using reduc_fn, if available.
4478 (scheme 2) using whole-vector shifts, if available.
4479 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4480 combined.
4482 The overall epilog code looks like this:
4484 s_out0 = phi <s_loop> # original EXIT_PHI
4485 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4486 v_out2 = reduce <v_out1> # step 1
4487 s_out3 = extract_field <v_out2, 0> # step 2
4488 s_out4 = adjust_result <s_out3> # step 3
4490 (step 3 is optional, and steps 1 and 2 may be combined).
4491 Lastly, the uses of s_out0 are replaced by s_out4. */
4494 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4495 v_out1 = phi <VECT_DEF>
4496 Store them in NEW_PHIS. */
4497 if (double_reduc)
4498 loop = outer_loop;
4499 exit_bb = single_exit (loop)->dest;
4500 prev_phi_info = NULL;
4501 new_phis.create (slp_node ? vec_num : ncopies);
4502 for (unsigned i = 0; i < vec_num; i++)
4504 if (slp_node)
4505 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4506 else
4507 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4508 for (j = 0; j < ncopies; j++)
4510 tree new_def = copy_ssa_name (def);
4511 phi = create_phi_node (new_def, exit_bb);
4512 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4513 if (j == 0)
4514 new_phis.quick_push (phi);
4515 else
4517 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4518 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4521 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4522 prev_phi_info = phi_info;
4526 exit_gsi = gsi_after_labels (exit_bb);
4528 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4529 (i.e. when reduc_fn is not available) and in the final adjustment
4530 code (if needed). Also get the original scalar reduction variable as
4531 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4532 represents a reduction pattern), the tree-code and scalar-def are
4533 taken from the original stmt that the pattern-stmt (STMT) replaces.
4534 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4535 are taken from STMT. */
4537 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4538 if (orig_stmt_info != stmt_info)
4540 /* Reduction pattern */
4541 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4542 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4545 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4546 scalar_type = TREE_TYPE (scalar_dest);
4547 scalar_results.create (group_size);
4548 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4549 bitsize = TYPE_SIZE (scalar_type);
4551 /* SLP reduction without reduction chain, e.g.,
4552 # a1 = phi <a2, a0>
4553 # b1 = phi <b2, b0>
4554 a2 = operation (a1)
4555 b2 = operation (b1) */
4556 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4558 /* True if we should implement SLP_REDUC using native reduction operations
4559 instead of scalar operations. */
4560 direct_slp_reduc = (reduc_fn != IFN_LAST
4561 && slp_reduc
4562 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4564 /* In case of reduction chain, e.g.,
4565 # a1 = phi <a3, a0>
4566 a2 = operation (a1)
4567 a3 = operation (a2),
4569 we may end up with more than one vector result. Here we reduce them to
4570 one vector. */
4571 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4573 tree first_vect = PHI_RESULT (new_phis[0]);
4574 gassign *new_vec_stmt = NULL;
4575 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4576 for (k = 1; k < new_phis.length (); k++)
4578 gimple *next_phi = new_phis[k];
4579 tree second_vect = PHI_RESULT (next_phi);
4580 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4581 new_vec_stmt = gimple_build_assign (tem, code,
4582 first_vect, second_vect);
4583 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4584 first_vect = tem;
4587 new_phi_result = first_vect;
4588 if (new_vec_stmt)
4590 new_phis.truncate (0);
4591 new_phis.safe_push (new_vec_stmt);
4594 /* Likewise if we couldn't use a single def-use cycle. */
4595 else if (ncopies > 1)
4597 gcc_assert (new_phis.length () == 1);
4598 tree first_vect = PHI_RESULT (new_phis[0]);
4599 gassign *new_vec_stmt = NULL;
4600 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4601 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4602 for (int k = 1; k < ncopies; ++k)
4604 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4605 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4606 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4607 new_vec_stmt = gimple_build_assign (tem, code,
4608 first_vect, second_vect);
4609 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4610 first_vect = tem;
4612 new_phi_result = first_vect;
4613 new_phis.truncate (0);
4614 new_phis.safe_push (new_vec_stmt);
4616 else
4617 new_phi_result = PHI_RESULT (new_phis[0]);
4619 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4620 && reduc_fn != IFN_LAST)
4622 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4623 various data values where the condition matched and another vector
4624 (INDUCTION_INDEX) containing all the indexes of those matches. We
4625 need to extract the last matching index (which will be the index with
4626 highest value) and use this to index into the data vector.
4627 For the case where there were no matches, the data vector will contain
4628 all default values and the index vector will be all zeros. */
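/* Continuing the illustration used earlier (hypothetical values): with
   index vector { 0, 10, 3, 0 } and data vector { d0, d1, d2, d3 }, the max
   index is 10, the comparison selects lane 1, the VEC_COND produces
   { 0, d1, 0, 0 }, and the final unsigned MAX reduction extracts d1.  */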
4630 /* Get various versions of the type of the vector of indexes. */
4631 tree index_vec_type = TREE_TYPE (induction_index);
4632 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4633 tree index_scalar_type = TREE_TYPE (index_vec_type);
4634 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4635 (index_vec_type);
4637 /* Get an unsigned integer version of the type of the data vector. */
4638 int scalar_precision
4639 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4640 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4641 tree vectype_unsigned = build_vector_type
4642 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4644 /* First we need to create a vector (ZERO_VEC) of zeros and another
4645 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4646 can create using a MAX reduction and then expanding.
4647 In the case where the loop never made any matches, the max index will
4648 be zero. */
4650 /* Vector of {0, 0, 0,...}. */
4651 tree zero_vec = make_ssa_name (vectype);
4652 tree zero_vec_rhs = build_zero_cst (vectype);
4653 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4654 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4656 /* Find maximum value from the vector of found indexes. */
4657 tree max_index = make_ssa_name (index_scalar_type);
4658 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4659 1, induction_index);
4660 gimple_call_set_lhs (max_index_stmt, max_index);
4661 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4663 /* Vector of {max_index, max_index, max_index,...}. */
4664 tree max_index_vec = make_ssa_name (index_vec_type);
4665 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4666 max_index);
4667 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4668 max_index_vec_rhs);
4669 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4671 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4672 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4673 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4674 otherwise. Only one value should match, resulting in a vector
4675 (VEC_COND) with one data value and the rest zeros.
4676 In the case where the loop never made any matches, every index will
4677 match, resulting in a vector with all data values (which will all be
4678 the default value). */
4680 /* Compare the max index vector to the vector of found indexes to find
4681 the position of the max value. */
4682 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4683 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4684 induction_index,
4685 max_index_vec);
4686 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4688 /* Use the compare to choose either values from the data vector or
4689 zero. */
4690 tree vec_cond = make_ssa_name (vectype);
4691 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4692 vec_compare, new_phi_result,
4693 zero_vec);
4694 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4696 /* Finally we need to extract the data value from the vector (VEC_COND)
4697 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4698 reduction, but because this doesn't exist, we can use a MAX reduction
4699 instead. The data value might be signed or a float so we need to cast
4700 it first.
4701 In the case where the loop never made any matches, the data values are
4702 all identical, and so will reduce down correctly. */
4704 /* Make the matched data values unsigned. */
4705 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4706 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4707 vec_cond);
4708 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4709 VIEW_CONVERT_EXPR,
4710 vec_cond_cast_rhs);
4711 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4713 /* Reduce down to a scalar value. */
4714 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4715 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4716 1, vec_cond_cast);
4717 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4718 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4720 /* Convert the reduced value back to the result type and set as the
4721 result. */
4722 gimple_seq stmts = NULL;
4723 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4724 data_reduc);
4725 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4726 scalar_results.safe_push (new_temp);
4728 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4729 && reduc_fn == IFN_LAST)
4731 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4732 idx = 0;
4733 idx_val = induction_index[0];
4734 val = data_reduc[0];
4735 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4736 if (induction_index[i] > idx_val)
4737 val = data_reduc[i], idx_val = induction_index[i];
4738 return val; */
4740 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4741 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4742 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4743 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4744 /* Enforced by vectorizable_reduction, which ensures we have target
4745 support before allowing a conditional reduction on variable-length
4746 vectors. */
4747 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4748 tree idx_val = NULL_TREE, val = NULL_TREE;
4749 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4751 tree old_idx_val = idx_val;
4752 tree old_val = val;
4753 idx_val = make_ssa_name (idx_eltype);
4754 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4755 build3 (BIT_FIELD_REF, idx_eltype,
4756 induction_index,
4757 bitsize_int (el_size),
4758 bitsize_int (off)));
4759 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4760 val = make_ssa_name (data_eltype);
4761 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4762 build3 (BIT_FIELD_REF,
4763 data_eltype,
4764 new_phi_result,
4765 bitsize_int (el_size),
4766 bitsize_int (off)));
4767 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4768 if (off != 0)
4770 tree new_idx_val = idx_val;
4771 if (off != v_size - el_size)
4773 new_idx_val = make_ssa_name (idx_eltype);
4774 epilog_stmt = gimple_build_assign (new_idx_val,
4775 MAX_EXPR, idx_val,
4776 old_idx_val);
4777 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4779 tree new_val = make_ssa_name (data_eltype);
4780 epilog_stmt = gimple_build_assign (new_val,
4781 COND_EXPR,
4782 build2 (GT_EXPR,
4783 boolean_type_node,
4784 idx_val,
4785 old_idx_val),
4786 val, old_val);
4787 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4788 idx_val = new_idx_val;
4789 val = new_val;
4792 /* Convert the reduced value back to the result type and set as the
4793 result. */
4794 gimple_seq stmts = NULL;
4795 val = gimple_convert (&stmts, scalar_type, val);
4796 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4797 scalar_results.safe_push (val);
4800 /* 2.3 Create the reduction code, using one of the three schemes described
4801 above. In SLP we simply need to extract all the elements from the
4802 vector (without reducing them), so we use scalar shifts. */
4803 else if (reduc_fn != IFN_LAST && !slp_reduc)
4805 tree tmp;
4806 tree vec_elem_type;
4808 /* Case 1: Create:
4809 v_out2 = reduc_expr <v_out1> */
4811 if (dump_enabled_p ())
4812 dump_printf_loc (MSG_NOTE, vect_location,
4813 "Reduce using direct vector reduction.\n");
4815 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4816 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4818 tree tmp_dest
4819 = vect_create_destination_var (scalar_dest, vec_elem_type);
4820 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4821 new_phi_result);
4822 gimple_set_lhs (epilog_stmt, tmp_dest);
4823 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4824 gimple_set_lhs (epilog_stmt, new_temp);
4825 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4827 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4828 new_temp);
4830 else
4832 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4833 new_phi_result);
4834 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4837 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4838 gimple_set_lhs (epilog_stmt, new_temp);
4839 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4841 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4842 == INTEGER_INDUC_COND_REDUCTION)
4843 && induc_val)
4845 /* Earlier we set the initial value to be a vector of induc_val
4846 values. Check the result and if it is induc_val then replace
4847 with the original initial value, unless induc_val is
4848 the same as initial_def already. */
4849 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4850 induc_val);
4852 tmp = make_ssa_name (new_scalar_dest);
4853 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4854 initial_def, new_temp);
4855 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4856 new_temp = tmp;
4859 scalar_results.safe_push (new_temp);
4861 else if (direct_slp_reduc)
4863 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4864 with the elements for other SLP statements replaced with the
4865 neutral value. We can then do a normal reduction on each vector. */
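/* For example (illustrative): with REDUC_GROUP_SIZE == 2 and a vector of
   partial results { a0, b0, a1, b1 }, the loop below builds { a0, N, a1, N }
   for the first SLP statement and { N, b0, N, b1 } for the second, N being
   the neutral value, and reduces each with REDUC_FN.  */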
4867 /* Enforced by vectorizable_reduction. */
4868 gcc_assert (new_phis.length () == 1);
4869 gcc_assert (pow2p_hwi (group_size));
4871 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4872 vec<stmt_vec_info> orig_phis
4873 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4874 gimple_seq seq = NULL;
4876 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4877 and the same element size as VECTYPE. */
4878 tree index = build_index_vector (vectype, 0, 1);
4879 tree index_type = TREE_TYPE (index);
4880 tree index_elt_type = TREE_TYPE (index_type);
4881 tree mask_type = build_same_sized_truth_vector_type (index_type);
4883 /* Create a vector that, for each element, identifies which of
4884 the REDUC_GROUP_SIZE results should use it. */
4885 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
4886 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
4887 build_vector_from_val (index_type, index_mask));
4889 /* Get a neutral vector value. This is simply a splat of the neutral
4890 scalar value if we have one, otherwise the initial scalar value
4891 is itself a neutral value. */
4892 tree vector_identity = NULL_TREE;
4893 if (neutral_op)
4894 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4895 neutral_op);
4896 for (unsigned int i = 0; i < group_size; ++i)
4898 /* If there's no universal neutral value, we can use the
4899 initial scalar value from the original PHI. This is used
4900 for MIN and MAX reduction, for example. */
4901 if (!neutral_op)
4903 tree scalar_value
4904 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
4905 loop_preheader_edge (loop));
4906 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4907 scalar_value);
4910 /* Calculate the equivalent of:
4912 sel[j] = (index[j] == i);
4914 which selects the elements of NEW_PHI_RESULT that should
4915 be included in the result. */
4916 tree compare_val = build_int_cst (index_elt_type, i);
4917 compare_val = build_vector_from_val (index_type, compare_val);
4918 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
4919 index, compare_val);
4921 /* Calculate the equivalent of:
4923 vec = sel ? new_phi_result : vector_identity;
4925 VEC is now suitable for a full vector reduction. */
4926 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
4927 sel, new_phi_result, vector_identity);
4929 /* Do the reduction and convert it to the appropriate type. */
4930 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
4931 TREE_TYPE (vectype), vec);
4932 scalar = gimple_convert (&seq, scalar_type, scalar);
4933 scalar_results.safe_push (scalar);
4935 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
4937 else
4939 bool reduce_with_shift;
4940 tree vec_temp;
4942 /* See if the target wants to do the final (shift) reduction
4943 in a vector mode of smaller size and first reduce upper/lower
4944 halves against each other. */
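/* For instance (target-dependent and purely illustrative): a 256-bit vector
   may first be split into its two 128-bit halves, which are combined with a
   single vector CODE operation before the shift-based reduction continues in
   the smaller mode.  */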
4945 enum machine_mode mode1 = mode;
4946 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
4947 unsigned sz1 = sz;
4948 if (!slp_reduc
4949 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
4950 sz1 = GET_MODE_SIZE (mode1).to_constant ();
4952 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
4953 reduce_with_shift = have_whole_vector_shift (mode1);
4954 if (!VECTOR_MODE_P (mode1))
4955 reduce_with_shift = false;
4956 else
4958 optab optab = optab_for_tree_code (code, vectype1, optab_default);
4959 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
4960 reduce_with_shift = false;
4963 /* First reduce the vector to the desired vector size we should
4964 do shift reduction on by combining upper and lower halves. */
4965 new_temp = new_phi_result;
4966 while (sz > sz1)
4968 gcc_assert (!slp_reduc);
4969 sz /= 2;
4970 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
4972 /* The target has to make sure we support lowpart/highpart
4973 extraction, either via direct vector extract or through
4974 an integer mode punning. */
4975 tree dst1, dst2;
4976 if (convert_optab_handler (vec_extract_optab,
4977 TYPE_MODE (TREE_TYPE (new_temp)),
4978 TYPE_MODE (vectype1))
4979 != CODE_FOR_nothing)
4981 /* Extract sub-vectors directly once vec_extract becomes
4982 a conversion optab. */
4983 dst1 = make_ssa_name (vectype1);
4984 epilog_stmt
4985 = gimple_build_assign (dst1, BIT_FIELD_REF,
4986 build3 (BIT_FIELD_REF, vectype1,
4987 new_temp, TYPE_SIZE (vectype1),
4988 bitsize_int (0)));
4989 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990 dst2 = make_ssa_name (vectype1);
4991 epilog_stmt
4992 = gimple_build_assign (dst2, BIT_FIELD_REF,
4993 build3 (BIT_FIELD_REF, vectype1,
4994 new_temp, TYPE_SIZE (vectype1),
4995 bitsize_int (sz * BITS_PER_UNIT)));
4996 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4998 else
5000 /* Extract via punning to appropriately sized integer mode
5001 vector. */
5002 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5004 tree etype = build_vector_type (eltype, 2);
5005 gcc_assert (convert_optab_handler (vec_extract_optab,
5006 TYPE_MODE (etype),
5007 TYPE_MODE (eltype))
5008 != CODE_FOR_nothing);
5009 tree tem = make_ssa_name (etype);
5010 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5011 build1 (VIEW_CONVERT_EXPR,
5012 etype, new_temp));
5013 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5014 new_temp = tem;
5015 tem = make_ssa_name (eltype);
5016 epilog_stmt
5017 = gimple_build_assign (tem, BIT_FIELD_REF,
5018 build3 (BIT_FIELD_REF, eltype,
5019 new_temp, TYPE_SIZE (eltype),
5020 bitsize_int (0)));
5021 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5022 dst1 = make_ssa_name (vectype1);
5023 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5024 build1 (VIEW_CONVERT_EXPR,
5025 vectype1, tem));
5026 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5027 tem = make_ssa_name (eltype);
5028 epilog_stmt
5029 = gimple_build_assign (tem, BIT_FIELD_REF,
5030 build3 (BIT_FIELD_REF, eltype,
5031 new_temp, TYPE_SIZE (eltype),
5032 bitsize_int (sz * BITS_PER_UNIT)));
5033 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5034 dst2 = make_ssa_name (vectype1);
5035 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5036 build1 (VIEW_CONVERT_EXPR,
5037 vectype1, tem));
5038 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5041 new_temp = make_ssa_name (vectype1);
5042 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5043 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5046 if (reduce_with_shift && !slp_reduc)
5048 int element_bitsize = tree_to_uhwi (bitsize);
5049 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5050 for variable-length vectors and also requires direct target support
5051 for loop reductions. */
5052 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5053 int nelements = vec_size_in_bits / element_bitsize;
5054 vec_perm_builder sel;
5055 vec_perm_indices indices;
5057 int elt_offset;
5059 tree zero_vec = build_zero_cst (vectype1);
5060 /* Case 2: Create:
5061 for (offset = nelements/2; offset >= 1; offset/=2)
5063 Create: va' = vec_shift <va, offset>
5064 Create: va = vop <va, va'>
5065 } */
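/* E.g. for a 4-element add reduction of { a, b, c, d } (illustrative):
   shifting by 2 gives { c, d, 0, 0 } and adding yields { a+c, b+d, c, d };
   shifting by 1 then gives { b+d, c, d, 0 } and the final add leaves the
   full sum a+b+c+d in element 0, which step 2.4 below extracts.  */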
5067 tree rhs;
5069 if (dump_enabled_p ())
5070 dump_printf_loc (MSG_NOTE, vect_location,
5071 "Reduce using vector shifts\n");
5073 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5074 for (elt_offset = nelements / 2;
5075 elt_offset >= 1;
5076 elt_offset /= 2)
5078 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5079 indices.new_vector (sel, 2, nelements);
5080 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5081 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5082 new_temp, zero_vec, mask);
5083 new_name = make_ssa_name (vec_dest, epilog_stmt);
5084 gimple_assign_set_lhs (epilog_stmt, new_name);
5085 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5087 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5088 new_temp);
5089 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5090 gimple_assign_set_lhs (epilog_stmt, new_temp);
5091 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5094 /* 2.4 Extract the final scalar result. Create:
5095 s_out3 = extract_field <v_out2, bitpos> */
5097 if (dump_enabled_p ())
5098 dump_printf_loc (MSG_NOTE, vect_location,
5099 "extract scalar result\n");
5101 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5102 bitsize, bitsize_zero_node);
5103 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5104 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5105 gimple_assign_set_lhs (epilog_stmt, new_temp);
5106 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5107 scalar_results.safe_push (new_temp);
5109 else
5111 /* Case 3: Create:
5112 s = extract_field <v_out2, 0>
5113 for (offset = element_size;
5114 offset < vector_size;
5115 offset += element_size;)
5117 Create: s' = extract_field <v_out2, offset>
5118 Create: s = op <s, s'> // For non SLP cases
5119 } */
5121 if (dump_enabled_p ())
5122 dump_printf_loc (MSG_NOTE, vect_location,
5123 "Reduce using scalar code.\n");
5125 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5126 int element_bitsize = tree_to_uhwi (bitsize);
5127 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5129 int bit_offset;
5130 if (gimple_code (new_phi) == GIMPLE_PHI)
5131 vec_temp = PHI_RESULT (new_phi);
5132 else
5133 vec_temp = gimple_assign_lhs (new_phi);
5134 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5135 bitsize_zero_node);
5136 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5137 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5138 gimple_assign_set_lhs (epilog_stmt, new_temp);
5139 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5141 /* In SLP we don't need to apply reduction operation, so we just
5142 collect s' values in SCALAR_RESULTS. */
5143 if (slp_reduc)
5144 scalar_results.safe_push (new_temp);
5146 for (bit_offset = element_bitsize;
5147 bit_offset < vec_size_in_bits;
5148 bit_offset += element_bitsize)
5150 tree bitpos = bitsize_int (bit_offset);
5151 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5152 bitsize, bitpos);
5154 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5155 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5156 gimple_assign_set_lhs (epilog_stmt, new_name);
5157 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5159 if (slp_reduc)
5161 /* In SLP we don't need to apply the reduction operation, so
5162 we just collect the s' values in SCALAR_RESULTS. */
5163 new_temp = new_name;
5164 scalar_results.safe_push (new_name);
5166 else
5168 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5169 new_name, new_temp);
5170 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5171 gimple_assign_set_lhs (epilog_stmt, new_temp);
5172 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5177 /* The only case where we need to reduce scalar results in SLP is
5178 unrolling. If the size of SCALAR_RESULTS is greater than
5179 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5180 REDUC_GROUP_SIZE. */
5181 if (slp_reduc)
5183 tree res, first_res, new_res;
5184 gimple *new_stmt;
5186 /* Reduce multiple scalar results in case of SLP unrolling. */
5187 for (j = group_size; scalar_results.iterate (j, &res);
5188 j++)
5190 first_res = scalar_results[j % group_size];
5191 new_stmt = gimple_build_assign (new_scalar_dest, code,
5192 first_res, res);
5193 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5194 gimple_assign_set_lhs (new_stmt, new_res);
5195 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5196 scalar_results[j % group_size] = new_res;
5199 else
5200 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5201 scalar_results.safe_push (new_temp);
5204 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5205 == INTEGER_INDUC_COND_REDUCTION)
5206 && induc_val)
5208 /* Earlier we set the initial value to be a vector of induc_val
5209 values. Check the result and if it is induc_val then replace it
5210 with the original initial value, unless induc_val is already
5211 the same as initial_def. */
5212 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5213 induc_val);
5215 tree tmp = make_ssa_name (new_scalar_dest);
5216 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5217 initial_def, new_temp);
5218 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5219 scalar_results[0] = tmp;
5223 /* 2.5 Adjust the final result by the initial value of the reduction
5224 variable. (When such adjustment is not needed, then
5225 'adjustment_def' is zero). For example, if code is PLUS we create:
5226 new_temp = loop_exit_def + adjustment_def */
5228 if (adjustment_def)
5230 gcc_assert (!slp_reduc);
5231 if (nested_in_vect_loop)
5233 new_phi = new_phis[0];
5234 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5235 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5236 new_dest = vect_create_destination_var (scalar_dest, vectype);
5238 else
5240 new_temp = scalar_results[0];
5241 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5242 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5243 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5246 epilog_stmt = gimple_build_assign (new_dest, expr);
5247 new_temp = make_ssa_name (new_dest, epilog_stmt);
5248 gimple_assign_set_lhs (epilog_stmt, new_temp);
5249 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5250 if (nested_in_vect_loop)
5252 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5253 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5254 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5256 if (!double_reduc)
5257 scalar_results.quick_push (new_temp);
5258 else
5259 scalar_results[0] = new_temp;
5261 else
5262 scalar_results[0] = new_temp;
5264 new_phis[0] = epilog_stmt;
5267 if (double_reduc)
5268 loop = loop->inner;
5270 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5271 phis with new adjusted scalar results, i.e., replace use <s_out0>
5272 with use <s_out4>.
5274 Transform:
5275 loop_exit:
5276 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5277 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5278 v_out2 = reduce <v_out1>
5279 s_out3 = extract_field <v_out2, 0>
5280 s_out4 = adjust_result <s_out3>
5281 use <s_out0>
5282 use <s_out0>
5284 into:
5286 loop_exit:
5287 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5288 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5289 v_out2 = reduce <v_out1>
5290 s_out3 = extract_field <v_out2, 0>
5291 s_out4 = adjust_result <s_out3>
5292 use <s_out4>
5293 use <s_out4> */
5296 /* In an SLP reduction chain we reduce vector results into one vector if
5297 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5298 LHS of the last stmt in the reduction chain, since we are looking for
5299 the loop exit phi node. */
5300 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5302 stmt_vec_info dest_stmt_info
5303 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5304 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5305 group_size = 1;
5308 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5309 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5310 Therefore, we need to match SCALAR_RESULTS with the corresponding statements.
5311 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5312 correspond to the first vector stmt, etc.
5313 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5314 if (group_size > new_phis.length ())
5315 gcc_assert (!(group_size % new_phis.length ()));
5317 for (k = 0; k < group_size; k++)
5319 if (slp_reduc)
5321 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5323 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5324 /* SLP statements can't participate in patterns. */
5325 gcc_assert (!orig_stmt_info);
5326 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5329 if (nested_in_vect_loop)
5331 if (double_reduc)
5332 loop = outer_loop;
5333 else
5334 gcc_unreachable ();
5337 phis.create (3);
5338 /* Find the loop-closed-use at the loop exit of the original scalar
5339 result. (The reduction result is expected to have two immediate uses,
5340 one at the latch block, and one at the loop exit). For double
5341 reductions we are looking for exit phis of the outer loop. */
5342 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5344 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5346 if (!is_gimple_debug (USE_STMT (use_p)))
5347 phis.safe_push (USE_STMT (use_p));
5349 else
5351 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5353 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5355 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5357 if (!flow_bb_inside_loop_p (loop,
5358 gimple_bb (USE_STMT (phi_use_p)))
5359 && !is_gimple_debug (USE_STMT (phi_use_p)))
5360 phis.safe_push (USE_STMT (phi_use_p));
5366 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5368 /* Replace the uses: */
5369 orig_name = PHI_RESULT (exit_phi);
5370 scalar_result = scalar_results[k];
5371 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5372 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5373 SET_USE (use_p, scalar_result);
5376 phis.release ();
5380 /* Return a vector of type VECTYPE that is equal to the vector select
5381 operation "MASK ? VEC : IDENTITY". Insert the select statements
5382 before GSI. */
5384 static tree
5385 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5386 tree vec, tree identity)
5388 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5389 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5390 mask, vec, identity);
5391 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5392 return cond;
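/* For instance, when a fully-masked loop open-codes an in-order PLUS
   reduction below, IDENTITY is a zero vector and the select becomes
   (illustrative SSA name)

       cond_N = VEC_COND_EXPR <mask, vec, { 0, ... }>;

   so the lanes switched off by the mask contribute the operation's
   identity value and leave the accumulator unchanged.  */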
5395 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5396 order, starting with LHS. Insert the extraction statements before GSI and
5397 associate the new scalar SSA names with variable SCALAR_DEST.
5398 Return the SSA name for the result. */
5400 static tree
5401 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5402 tree_code code, tree lhs, tree vector_rhs)
5404 tree vectype = TREE_TYPE (vector_rhs);
5405 tree scalar_type = TREE_TYPE (vectype);
5406 tree bitsize = TYPE_SIZE (scalar_type);
5407 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5408 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5410 for (unsigned HOST_WIDE_INT bit_offset = 0;
5411 bit_offset < vec_size_in_bits;
5412 bit_offset += element_bitsize)
5414 tree bitpos = bitsize_int (bit_offset);
5415 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5416 bitsize, bitpos);
5418 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5419 rhs = make_ssa_name (scalar_dest, stmt);
5420 gimple_assign_set_lhs (stmt, rhs);
5421 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5423 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5424 tree new_name = make_ssa_name (scalar_dest, stmt);
5425 gimple_assign_set_lhs (stmt, new_name);
5426 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5427 lhs = new_name;
5429 return lhs;
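/* For example (illustrative SSA names, 32-bit elements assumed), with a
   four-element VECTOR_RHS and CODE == PLUS_EXPR this emits

       s_0 = BIT_FIELD_REF <rhs, 32, 0>;   lhs_1 = lhs   + s_0;
       s_1 = BIT_FIELD_REF <rhs, 32, 32>;  lhs_2 = lhs_1 + s_1;
       s_2 = BIT_FIELD_REF <rhs, 32, 64>;  lhs_3 = lhs_2 + s_2;
       s_3 = BIT_FIELD_REF <rhs, 32, 96>;  lhs_4 = lhs_3 + s_3;

   and returns lhs_4, keeping the strict left-to-right association that
   an in-order reduction requires.  */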
5432 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5433 type of the vector input. */
5435 static internal_fn
5436 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5438 internal_fn mask_reduc_fn;
5440 switch (reduc_fn)
5442 case IFN_FOLD_LEFT_PLUS:
5443 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5444 break;
5446 default:
5447 return IFN_LAST;
5450 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5451 OPTIMIZE_FOR_SPEED))
5452 return mask_reduc_fn;
5453 return IFN_LAST;
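/* E.g. for REDUC_FN == IFN_FOLD_LEFT_PLUS on a target that provides the
   masked variant, the caller can emit a single

       reduc = .MASK_FOLD_LEFT_PLUS (reduc, vec, mask);

   instead of first merging VEC with the identity vector and then calling

       reduc = .FOLD_LEFT_PLUS (reduc, vec);

   (see vectorize_fold_left_reduction below).  */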
5456 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5457 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5458 statement. CODE is the operation performed by STMT_INFO and OPS are
5459 its scalar operands. REDUC_INDEX is the index of the operand in
5460 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5461 implements in-order reduction, or IFN_LAST if we should open-code it.
5462 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5463 that should be used to control the operation in a fully-masked loop. */
5465 static bool
5466 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5467 gimple_stmt_iterator *gsi,
5468 stmt_vec_info *vec_stmt, slp_tree slp_node,
5469 gimple *reduc_def_stmt,
5470 tree_code code, internal_fn reduc_fn,
5471 tree ops[3], tree vectype_in,
5472 int reduc_index, vec_loop_masks *masks)
5474 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5475 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5476 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5477 stmt_vec_info new_stmt_info = NULL;
5478 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5480 int ncopies;
5481 if (slp_node)
5482 ncopies = 1;
5483 else
5484 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5486 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5487 gcc_assert (ncopies == 1);
5488 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5489 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5490 == FOLD_LEFT_REDUCTION);
5492 if (slp_node)
5493 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5494 TYPE_VECTOR_SUBPARTS (vectype_in)));
5496 tree op0 = ops[1 - reduc_index];
5498 int group_size = 1;
5499 stmt_vec_info scalar_dest_def_info;
5500 auto_vec<tree> vec_oprnds0;
5501 if (slp_node)
5503 auto_vec<vec<tree> > vec_defs (2);
5504 auto_vec<tree> sops(2);
5505 sops.quick_push (ops[0]);
5506 sops.quick_push (ops[1]);
5507 vect_get_slp_defs (sops, slp_node, &vec_defs);
5508 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5509 vec_defs[0].release ();
5510 vec_defs[1].release ();
5511 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5512 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5514 else
5516 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5517 vec_oprnds0.create (1);
5518 vec_oprnds0.quick_push (loop_vec_def0);
5519 scalar_dest_def_info = stmt_info;
5522 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5523 tree scalar_type = TREE_TYPE (scalar_dest);
5524 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5526 int vec_num = vec_oprnds0.length ();
5527 gcc_assert (vec_num == 1 || slp_node);
5528 tree vec_elem_type = TREE_TYPE (vectype_out);
5529 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5531 tree vector_identity = NULL_TREE;
5532 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5533 vector_identity = build_zero_cst (vectype_out);
5535 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5536 int i;
5537 tree def0;
5538 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5540 gimple *new_stmt;
5541 tree mask = NULL_TREE;
5542 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5543 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5545 /* Handle MINUS by adding the negative. */
5546 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5548 tree negated = make_ssa_name (vectype_out);
5549 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5550 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5551 def0 = negated;
5554 if (mask && mask_reduc_fn == IFN_LAST)
5555 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5556 vector_identity);
5558 /* On the first iteration the input is simply the scalar phi
5559 result, and for subsequent iterations it is the output of
5560 the preceding operation. */
5561 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5563 if (mask && mask_reduc_fn != IFN_LAST)
5564 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5565 def0, mask);
5566 else
5567 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5568 def0);
5569 /* For chained SLP reductions the output of the previous reduction
5570 operation serves as the input of the next. For the final statement
5571 the output cannot be a temporary - we reuse the original
5572 scalar destination of the last statement. */
5573 if (i != vec_num - 1)
5575 gimple_set_lhs (new_stmt, scalar_dest_var);
5576 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5577 gimple_set_lhs (new_stmt, reduc_var);
5580 else
5582 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5583 reduc_var, def0);
5584 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5585 /* Remove the statement, so that we can use the same code paths
5586 as for statements that we've just created. */
5587 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5588 gsi_remove (&tmp_gsi, true);
5591 if (i == vec_num - 1)
5593 gimple_set_lhs (new_stmt, scalar_dest);
5594 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5595 new_stmt);
5597 else
5598 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5599 new_stmt, gsi);
5601 if (slp_node)
5602 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5605 if (!slp_node)
5606 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5608 return true;
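/* As a sketch (types and names illustrative, not from the code above): for

       double s = init;
       for (int i = 0; i < n; ++i)
	 s += a[i];

   an in-order reduction keeps a scalar accumulator and emits one
   statement per vector of A, either

       s_1 = .FOLD_LEFT_PLUS (s_0, vec_a);

   or the open-coded vect_expand_fold_left sequence, so the additions are
   performed in the original order and the FP result matches the scalar
   loop without requiring reassociation.  */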
5611 /* Function is_nonwrapping_integer_induction.
5613 Check that STMT_VINFO (which is part of loop LOOP) is an induction
5614 that both increments and does not cause overflow. */
5616 static bool
5617 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5619 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5620 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5621 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5622 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5623 widest_int ni, max_loop_value, lhs_max;
5624 wi::overflow_type overflow = wi::OVF_NONE;
5626 /* Make sure the loop is integer based. */
5627 if (TREE_CODE (base) != INTEGER_CST
5628 || TREE_CODE (step) != INTEGER_CST)
5629 return false;
5631 /* Check that the max size of the loop will not wrap. */
5633 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5634 return true;
5636 if (! max_stmt_executions (loop, &ni))
5637 return false;
5639 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5640 &overflow);
5641 if (overflow)
5642 return false;
5644 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5645 TYPE_SIGN (lhs_type), &overflow);
5646 if (overflow)
5647 return false;
5649 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5650 <= TYPE_PRECISION (lhs_type));
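/* For example, an "unsigned char" IV with base 0 and step 1 in a loop
   whose statements may execute up to 300 times is rejected: the final
   value needs 9 bits of precision, more than the 8 bits of the type, so
   the induction could wrap.  Types with undefined overflow (e.g. signed
   int without -fwrapv) are accepted without this check.  */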
5653 /* Check if masking can be supported by inserting a conditional expression.
5654 CODE is the code for the operation. COND_FN is the conditional internal
5655 function, if it exists. VECTYPE_IN is the type of the vector input. */
5656 static bool
5657 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5658 tree vectype_in)
5660 if (cond_fn != IFN_LAST
5661 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5662 OPTIMIZE_FOR_SPEED))
5663 return false;
5665 switch (code)
5667 case DOT_PROD_EXPR:
5668 case SAD_EXPR:
5669 return true;
5671 default:
5672 return false;
5676 /* Insert a conditional expression to enable masked vectorization. CODE is the
5677 code for the operation. VOP is the array of operands. MASK is the loop
5678 mask. GSI is a statement iterator used to place the new conditional
5679 expression. */
5680 static void
5681 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5682 gimple_stmt_iterator *gsi)
5684 switch (code)
5686 case DOT_PROD_EXPR:
5688 tree vectype = TREE_TYPE (vop[1]);
5689 tree zero = build_zero_cst (vectype);
5690 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5691 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5692 mask, vop[1], zero);
5693 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5694 vop[1] = masked_op1;
5695 break;
5698 case SAD_EXPR:
5700 tree vectype = TREE_TYPE (vop[1]);
5701 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5702 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5703 mask, vop[1], vop[0]);
5704 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5705 vop[1] = masked_op1;
5706 break;
5709 default:
5710 gcc_unreachable ();
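/* The choice of the "else" value above is what keeps the masked-off lanes
   neutral: for DOT_PROD_EXPR a zero operand makes the lane's product zero,
   and for SAD_EXPR selecting vop[0] itself makes the lane's absolute
   difference zero, so in both cases inactive lanes add nothing to the
   accumulator.  */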
5714 /* Function vectorizable_reduction.
5716 Check if STMT_INFO performs a reduction operation that can be vectorized.
5717 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5718 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5719 Return true if STMT_INFO is vectorizable in this way.
5721 This function also handles reduction idioms (patterns) that have been
5722 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5723 may be of this form:
5724 X = pattern_expr (arg0, arg1, ..., X)
5725 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5726 sequence that had been detected and replaced by the pattern-stmt
5727 (STMT_INFO).
5729 This function also handles reduction of condition expressions, for example:
5730 for (int i = 0; i < N; i++)
5731 if (a[i] < value)
5732 last = a[i];
5733 This is handled by vectorising the loop and creating an additional vector
5734 containing the loop indexes for which "a[i] < value" was true. In the
5735 function epilogue this is reduced to a single max value and then used to
5736 index into the vector of results.
5738 In some cases of reduction patterns, the type of the reduction variable X is
5739 different than the type of the other arguments of STMT_INFO.
5740 In such cases, the vectype that is used when transforming STMT_INFO into
5741 a vector stmt is different than the vectype that is used to determine the
5742 vectorization factor, because it consists of a different number of elements
5743 than the actual number of elements that are being operated upon in parallel.
5745 For example, consider an accumulation of shorts into an int accumulator.
5746 On some targets it's possible to vectorize this pattern operating on 8
5747 shorts at a time (hence, the vectype for purposes of determining the
5748 vectorization factor should be V8HI); on the other hand, the vectype that
5749 is used to create the vector form is actually V4SI (the type of the result).
5751 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5752 indicates what is the actual level of parallelism (V8HI in the example), so
5753 that the right vectorization factor would be derived. This vectype
5754 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5755 be used to create the vectorized stmt. The right vectype for the vectorized
5756 stmt is obtained from the type of the result X:
5757 get_vectype_for_scalar_type (TREE_TYPE (X))
5759 This means that, contrary to "regular" reductions (or "regular" stmts in
5760 general), the following equation:
5761 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5762 does *NOT* necessarily hold for reduction patterns. */
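/* A concrete sketch of the shorts-into-int case described above
   (types illustrative):

       short a[N];  int sum = 0;
       for (i = 0; i < N; i++)
	 sum += a[i];

   is recognized as  sum = WIDEN_SUM_EXPR <a[i], sum>,  whose
   STMT_VINFO_VECTYPE is V8HI (eight shorts fix the VF), while the stmt
   that is actually generated operates on V4SI, the vectype obtained from
   the int result type via get_vectype_for_scalar_type.  */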
5764 bool
5765 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5766 stmt_vec_info *vec_stmt, slp_tree slp_node,
5767 slp_instance slp_node_instance,
5768 stmt_vector_for_cost *cost_vec)
5770 tree scalar_dest;
5771 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5772 tree vectype_in = NULL_TREE;
5773 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5774 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5775 enum tree_code code;
5776 internal_fn reduc_fn;
5777 machine_mode vec_mode;
5778 int op_type;
5779 optab optab;
5780 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5781 stmt_vec_info cond_stmt_vinfo = NULL;
5782 tree scalar_type;
5783 bool is_simple_use;
5784 int i;
5785 int ncopies;
5786 stmt_vec_info prev_phi_info;
5787 bool single_defuse_cycle = false;
5788 int j;
5789 tree ops[3];
5790 enum vect_def_type dts[3];
5791 bool nested_cycle = false, found_nested_cycle_def = false;
5792 bool double_reduc = false;
5793 int vec_num;
5794 tree tem;
5795 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5796 tree cond_reduc_val = NULL_TREE;
5798 /* Make sure it was already recognized as a reduction computation. */
5799 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5800 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5801 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5802 return false;
5804 if (nested_in_vect_loop_p (loop, stmt_info))
5806 loop = loop->inner;
5807 nested_cycle = true;
5810 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5811 gcc_assert (slp_node
5812 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5814 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
5816 tree phi_result = gimple_phi_result (phi);
5817 /* Analysis is fully done on the reduction stmt invocation. */
5818 if (! vec_stmt)
5820 if (slp_node)
5821 slp_node_instance->reduc_phis = slp_node;
5823 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5824 return true;
5827 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
5828 /* Leave the scalar phi in place. Note that checking
5829 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
5830 for reductions involving a single statement. */
5831 return true;
5833 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5834 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
5836 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
5837 == EXTRACT_LAST_REDUCTION)
5838 /* Leave the scalar phi in place. */
5839 return true;
5841 if (gassign *reduc_stmt = dyn_cast <gassign *> (reduc_stmt_info->stmt))
5842 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5844 tree op = gimple_op (reduc_stmt, k);
5845 if (op == phi_result)
5846 continue;
5847 if (k == 1 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5848 continue;
5849 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
5850 gcc_assert (is_simple_use);
5851 if (dt == vect_constant_def || dt == vect_external_def)
5852 continue;
5853 if (!vectype_in
5854 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5855 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
5856 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
5857 break;
5859 /* For a nested cycle we might end up with an operation like
5860 phi_result * phi_result. */
5861 if (!vectype_in)
5862 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
5863 gcc_assert (vectype_in);
5865 if (slp_node)
5867 /* The size vect_schedule_slp_instance computes is off for us. */
5868 vec_num = vect_get_num_vectors
5869 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5870 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
5871 ncopies = 1;
5873 else
5875 vec_num = 1;
5876 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5879 /* Check whether we can use a single PHI node and accumulate
5880 vectors to one before the backedge. */
5881 stmt_vec_info use_stmt_info;
5882 if (ncopies > 1
5883 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
5884 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
5885 && (!STMT_VINFO_IN_PATTERN_P (use_stmt_info)
5886 || !STMT_VINFO_PATTERN_DEF_SEQ (use_stmt_info))
5887 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
5889 single_defuse_cycle = true;
5890 ncopies = 1;
5893 /* Create the destination vector */
5894 tree vec_dest = vect_create_destination_var (phi_result, vectype_out);
5896 /* Get the loop-entry arguments. */
5897 tree vec_initial_def;
5898 auto_vec<tree> vec_initial_defs;
5899 if (slp_node)
5901 vec_initial_defs.reserve (vec_num);
5902 gcc_assert (slp_node == slp_node_instance->reduc_phis);
5903 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
5904 tree neutral_op
5905 = neutral_op_for_slp_reduction (slp_node,
5906 STMT_VINFO_REDUC_CODE
5907 (first ? first : reduc_stmt_info),
5908 first != NULL);
5909 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
5910 &vec_initial_defs, vec_num,
5911 first != NULL, neutral_op);
5913 else
5915 /* Get at the scalar def before the loop, that defines the initial
5916 value of the reduction variable. */
5917 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
5918 loop_preheader_edge (loop));
5919 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
5920 and we can't use zero for induc_val, use initial_def. Similarly
5921 for REDUC_MIN and initial_def larger than the base. */
5922 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
5923 == INTEGER_INDUC_COND_REDUCTION)
5925 tree induc_val
5926 = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_stmt_info);
5927 if (TREE_CODE (initial_def) == INTEGER_CST
5928 && (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
5929 == INTEGER_INDUC_COND_REDUCTION)
5930 && !integer_zerop (induc_val)
5931 && (((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_stmt_info)
5932 == MAX_EXPR)
5933 && tree_int_cst_lt (initial_def, induc_val))
5934 || ((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_stmt_info)
5935 == MIN_EXPR)
5936 && tree_int_cst_lt (induc_val, initial_def))))
5938 induc_val = initial_def;
5939 /* Communicate to the epilogue generation that we used
5940 the initial_def. */
5941 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_stmt_info)
5942 = NULL_TREE;
5944 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
5946 else if (nested_cycle)
5948 /* Do not use an adjustment def as that case is not supported
5949 correctly if ncopies is not one. */
5950 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
5951 reduc_stmt_info);
5953 else
5955 tree adjustment_def = NULL_TREE;
5956 tree *adjustment_defp = &adjustment_def;
5957 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_stmt_info);
5958 /* ??? For the outer loop PHI we have to do a bit of searching
5959 to find the stmt with the code. reduc_stmt_info here is the
5960 loop-closed PHI of the inner reduction which means we can look
5961 at its single-arg def. */
5962 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5964 tree def = gimple_phi_arg_def
5965 (as_a <gphi *> (reduc_stmt_info->stmt), 0);
5966 code = STMT_VINFO_REDUC_CODE
5967 (vect_stmt_to_vectorize (loop_vinfo->lookup_def (def)));
5968 adjustment_defp = NULL;
5970 vec_initial_def
5971 = get_initial_def_for_reduction (reduc_stmt_info, code,
5972 initial_def, adjustment_defp);
5973 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_stmt_info)
5974 = adjustment_def;
5976 vec_initial_defs.create (1);
5977 vec_initial_defs.quick_push (vec_initial_def);
5980 /* Generate the reduction PHIs upfront. */
5981 prev_phi_info = NULL;
5982 for (i = 0; i < vec_num; i++)
5984 tree vec_init_def = vec_initial_defs[i];
5985 for (j = 0; j < ncopies; j++)
5987 /* Create the reduction-phi that defines the reduction
5988 operand. */
5989 gphi *new_phi = create_phi_node (vec_dest, loop->header);
5990 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
5992 /* Set the loop-entry arg of the reduction-phi. */
5993 if (j != 0 && nested_cycle)
5994 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
5995 vec_init_def);
5996 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
5997 UNKNOWN_LOCATION);
5999 /* The loop-latch arg is set in epilogue processing. */
6001 if (slp_node)
6002 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6003 else
6005 if (j == 0)
6006 STMT_VINFO_VEC_STMT (stmt_info)
6007 = *vec_stmt = new_phi_info;
6008 else
6009 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6010 prev_phi_info = new_phi_info;
6015 return true;
6018 /* 1. Is vectorizable reduction? */
6019 /* Not supportable if the reduction variable is used in the loop, unless
6020 it's a reduction chain. */
6021 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6022 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6023 return false;
6025 /* Reductions that are not used even in an enclosing outer-loop
6026 are expected to be "live" (used out of the loop). */
6027 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6028 && !STMT_VINFO_LIVE_P (stmt_info))
6029 return false;
6031 /* 2. Has this been recognized as a reduction pattern?
6033 Check if STMT represents a pattern that has been recognized
6034 in earlier analysis stages. For stmts that represent a pattern,
6035 the STMT_VINFO_RELATED_STMT field records the last stmt in
6036 the original sequence that constitutes the pattern. */
6038 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6039 if (orig_stmt_info)
6041 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6042 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6045 /* 3. Check the operands of the operation. The first operands are defined
6046 inside the loop body. The last operand is the reduction variable,
6047 which is defined by the loop-header-phi. */
6049 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6051 /* Flatten RHS. */
6052 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6054 case GIMPLE_BINARY_RHS:
6055 code = gimple_assign_rhs_code (stmt);
6056 op_type = TREE_CODE_LENGTH (code);
6057 gcc_assert (op_type == binary_op);
6058 ops[0] = gimple_assign_rhs1 (stmt);
6059 ops[1] = gimple_assign_rhs2 (stmt);
6060 break;
6062 case GIMPLE_TERNARY_RHS:
6063 code = gimple_assign_rhs_code (stmt);
6064 op_type = TREE_CODE_LENGTH (code);
6065 gcc_assert (op_type == ternary_op);
6066 ops[0] = gimple_assign_rhs1 (stmt);
6067 ops[1] = gimple_assign_rhs2 (stmt);
6068 ops[2] = gimple_assign_rhs3 (stmt);
6069 break;
6071 case GIMPLE_UNARY_RHS:
6072 return false;
6074 default:
6075 gcc_unreachable ();
6078 if (code == COND_EXPR && slp_node)
6079 return false;
6081 scalar_dest = gimple_assign_lhs (stmt);
6082 scalar_type = TREE_TYPE (scalar_dest);
6083 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6084 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6085 return false;
6087 /* Do not try to vectorize bit-precision reductions. */
6088 if (!type_has_mode_precision_p (scalar_type))
6089 return false;
6091 /* All uses but the last are expected to be defined in the loop.
6092 The last use is the reduction variable. In case of a nested cycle this
6093 assumption is not true: we use reduc_index to record the index of the
6094 reduction variable. */
6095 stmt_vec_info reduc_def_info;
6096 if (orig_stmt_info)
6097 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6098 else
6099 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6100 gcc_assert (reduc_def_info);
6101 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6102 tree reduc_def = PHI_RESULT (reduc_def_phi);
6103 int reduc_index = -1;
6104 for (i = 0; i < op_type; i++)
6106 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6107 if (i == 0 && code == COND_EXPR)
6108 continue;
6110 stmt_vec_info def_stmt_info;
6111 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6112 &def_stmt_info);
6113 dt = dts[i];
6114 gcc_assert (is_simple_use);
6115 if (dt == vect_reduction_def
6116 && ops[i] == reduc_def)
6118 reduc_index = i;
6119 continue;
6121 else if (tem)
6123 /* To properly compute ncopies we are interested in the widest
6124 input type in case we're looking at a widening accumulation. */
6125 if (!vectype_in
6126 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6127 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6128 vectype_in = tem;
6131 if (dt != vect_internal_def
6132 && dt != vect_external_def
6133 && dt != vect_constant_def
6134 && dt != vect_induction_def
6135 && !(dt == vect_nested_cycle && nested_cycle))
6136 return false;
6138 if (dt == vect_nested_cycle
6139 && ops[i] == reduc_def)
6141 found_nested_cycle_def = true;
6142 reduc_index = i;
6145 if (code == COND_EXPR)
6147 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6148 if (dt == vect_constant_def)
6150 cond_reduc_dt = dt;
6151 cond_reduc_val = ops[i];
6153 if (dt == vect_induction_def
6154 && def_stmt_info
6155 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6157 cond_reduc_dt = dt;
6158 cond_stmt_vinfo = def_stmt_info;
6163 if (!vectype_in)
6164 vectype_in = vectype_out;
6166 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6167 directly used in the stmt. */
6168 if (reduc_index == -1)
6170 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6172 if (dump_enabled_p ())
6173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6174 "in-order reduction chain without SLP.\n");
6175 return false;
6179 if (!(reduc_index == -1
6180 || dts[reduc_index] == vect_reduction_def
6181 || dts[reduc_index] == vect_nested_cycle
6182 || ((dts[reduc_index] == vect_internal_def
6183 || dts[reduc_index] == vect_external_def
6184 || dts[reduc_index] == vect_constant_def
6185 || dts[reduc_index] == vect_induction_def)
6186 && nested_cycle && found_nested_cycle_def)))
6188 /* For pattern recognized stmts, orig_stmt might be a reduction,
6189 but some helper statements for the pattern might not, or
6190 might be COND_EXPRs with reduction uses in the condition. */
6191 gcc_assert (orig_stmt_info);
6192 return false;
6195 /* PHIs should not participate in patterns. */
6196 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6197 enum vect_reduction_type v_reduc_type
6198 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6199 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6201 if (!vec_stmt)
6202 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6203 /* If we have a condition reduction, see if we can simplify it further. */
6204 if (v_reduc_type == COND_REDUCTION
6205 && !vec_stmt)
6207 /* TODO: We can't yet handle reduction chains, since we need to treat
6208 each COND_EXPR in the chain specially, not just the last one.
6209 E.g. for:
6211 x_1 = PHI <x_3, ...>
6212 x_2 = a_2 ? ... : x_1;
6213 x_3 = a_3 ? ... : x_2;
6215 we're interested in the last element in x_3 for which a_2 || a_3
6216 is true, whereas the current reduction chain handling would
6217 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6218 as a reduction operation. */
6219 if (reduc_index == -1)
6221 if (dump_enabled_p ())
6222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6223 "conditional reduction chains not supported\n");
6224 return false;
6227 if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6228 vectype_in, OPTIMIZE_FOR_SPEED))
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "optimizing condition reduction with"
6233 " FOLD_EXTRACT_LAST.\n");
6234 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6236 else if (cond_reduc_dt == vect_induction_def)
6238 tree base
6239 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6240 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6242 gcc_assert (TREE_CODE (base) == INTEGER_CST
6243 && TREE_CODE (step) == INTEGER_CST);
6244 cond_reduc_val = NULL_TREE;
6245 enum tree_code cond_reduc_op_code = ERROR_MARK;
6246 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6247 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6249 /* Find a suitable value: below base for MAX_EXPR, above base for
6250 MIN_EXPR; for now punt if base is the minimum value of the type for
6251 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6252 else if (tree_int_cst_sgn (step) == -1)
6254 cond_reduc_op_code = MIN_EXPR;
6255 if (tree_int_cst_sgn (base) == -1)
6256 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6257 else if (tree_int_cst_lt (base,
6258 TYPE_MAX_VALUE (TREE_TYPE (base))))
6259 cond_reduc_val
6260 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6262 else
6264 cond_reduc_op_code = MAX_EXPR;
6265 if (tree_int_cst_sgn (base) == 1)
6266 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6267 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6268 base))
6269 cond_reduc_val
6270 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6272 if (cond_reduc_val)
6274 STMT_VINFO_VEC_COND_REDUC_CODE (stmt_info)
6275 = cond_reduc_op_code;
6276 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (stmt_info)
6277 = cond_reduc_val;
6278 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6279 = INTEGER_INDUC_COND_REDUCTION;
6282 else if (cond_reduc_dt == vect_constant_def)
6284 enum vect_def_type cond_initial_dt;
6285 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6286 tree cond_initial_val
6287 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6289 gcc_assert (cond_reduc_val != NULL_TREE);
6290 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6291 if (cond_initial_dt == vect_constant_def
6292 && types_compatible_p (TREE_TYPE (cond_initial_val),
6293 TREE_TYPE (cond_reduc_val)))
6295 tree e = fold_binary (LE_EXPR, boolean_type_node,
6296 cond_initial_val, cond_reduc_val);
6297 if (e && (integer_onep (e) || integer_zerop (e)))
6299 if (dump_enabled_p ())
6300 dump_printf_loc (MSG_NOTE, vect_location,
6301 "condition expression based on "
6302 "compile time constant.\n");
6303 /* Record reduction code at analysis stage. */
6304 STMT_VINFO_VEC_COND_REDUC_CODE (stmt_info)
6305 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6306 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6307 = CONST_COND_REDUCTION;
6312 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == INTEGER_INDUC_COND_REDUCTION
6313 && dump_enabled_p ())
6314 dump_printf_loc (MSG_NOTE, vect_location,
6315 "condition expression based on "
6316 "integer induction.\n");
6318 if (orig_stmt_info)
6319 gcc_assert (tmp == orig_stmt_info
6320 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6321 else
6322 /* We changed STMT to be the first stmt in reduction chain, hence we
6323 check that in this case the first element in the chain is STMT. */
6324 gcc_assert (tmp == stmt_info
6325 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6327 if (STMT_VINFO_LIVE_P (reduc_def_info))
6328 return false;
6330 if (slp_node)
6331 ncopies = 1;
6332 else
6333 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6335 gcc_assert (ncopies >= 1);
6337 vec_mode = TYPE_MODE (vectype_in);
6338 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6340 if (nested_cycle)
6342 basic_block def_bb = gimple_bb (reduc_def_phi);
6343 class loop *def_stmt_loop = def_bb->loop_father;
6344 tree def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6345 loop_preheader_edge (def_stmt_loop));
6346 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6347 if (def_arg_stmt_info
6348 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6349 == vect_double_reduction_def))
6350 double_reduc = true;
6351 gcc_assert (!double_reduc || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer_by_reduction);
6354 vect_reduction_type reduction_type
6355 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6356 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6357 && ncopies > 1)
6359 if (dump_enabled_p ())
6360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6361 "multiple types in double reduction or condition "
6362 "reduction.\n");
6363 return false;
6366 if (code == COND_EXPR)
6368 /* Only call during the analysis stage, otherwise we'll lose
6369 STMT_VINFO_TYPE. */
6370 gcc_assert (nested_cycle || reduc_index > 0);
6371 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6372 true, reduc_index,
6373 NULL, cost_vec))
6375 if (dump_enabled_p ())
6376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6377 "unsupported condition in reduction\n");
6378 return false;
6381 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6382 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6384 /* Only call during the analysis stage, otherwise we'll lose
6385 STMT_VINFO_TYPE. We only support this for nested cycles
6386 without double reductions at the moment. */
6387 if (!nested_cycle
6388 || double_reduc
6389 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6390 NULL, cost_vec)))
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6394 "unsupported shift or rotation in reduction\n");
6395 return false;
6398 else
6400 /* 4. Supportable by target? */
6402 /* 4.1. check support for the operation in the loop */
6403 optab = optab_for_tree_code (code, vectype_in, optab_default);
6404 if (!optab)
6406 if (dump_enabled_p ())
6407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6408 "no optab.\n");
6410 return false;
6413 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6415 if (dump_enabled_p ())
6416 dump_printf (MSG_NOTE, "op not supported by target.\n");
6418 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6419 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6420 return false;
6422 if (dump_enabled_p ())
6423 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6426 /* Worthwhile without SIMD support? */
6427 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6428 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6430 if (dump_enabled_p ())
6431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6432 "not worthwhile without SIMD support.\n");
6434 return false;
6438 /* 4.2. Check support for the epilog operation.
6440 If STMT represents a reduction pattern, then the type of the
6441 reduction variable may be different than the type of the rest
6442 of the arguments. For example, consider the case of accumulation
6444 of shorts into an int accumulator; the original code:
6444 S1: int_a = (int) short_a;
6445 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6447 was replaced with:
6448 STMT: int_acc = widen_sum <short_a, int_acc>
6450 This means that:
6451 1. The tree-code that is used to create the vector operation in the
6452 epilog code (that reduces the partial results) is not the
6453 tree-code of STMT, but is rather the tree-code of the original
6454 stmt from the pattern that STMT is replacing. I.e, in the example
6455 above we want to use 'widen_sum' in the loop, but 'plus' in the
6456 epilog.
6457 2. The type (mode) we use to check available target support
6458 for the vector operation to be created in the *epilog*, is
6459 determined by the type of the reduction variable (in the example
6460 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6461 However the type (mode) we use to check available target support
6462 for the vector operation to be created *inside the loop*, is
6463 determined by the type of the other arguments to STMT (in the
6464 example we'd check this: optab_handler (widen_sum_optab,
6465 vect_short_mode)).
6467 This is contrary to "regular" reductions, in which the types of all
6468 the arguments are the same as the type of the reduction variable.
6469 For "regular" reductions we can therefore use the same vector type
6470 (and also the same tree-code) when generating the epilog code and
6471 when generating the code inside the loop. */
6473 enum tree_code orig_code;
6474 if (orig_stmt_info
6475 && (reduction_type == TREE_CODE_REDUCTION
6476 || reduction_type == FOLD_LEFT_REDUCTION))
6478 /* This is a reduction pattern: get the vectype from the type of the
6479 reduction variable, and get the tree-code from orig_stmt. */
6480 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6481 gcc_assert (vectype_out);
6482 vec_mode = TYPE_MODE (vectype_out);
6484 else
6486 /* Regular reduction: the same vectype and tree-code as used for
6487 the vector code inside the loop can also be used for the epilog code. */
6488 orig_code = code;
6490 if (code == MINUS_EXPR)
6491 orig_code = PLUS_EXPR;
6493 /* For simple condition reductions, replace with the actual expression
6494 we want to base our reduction around. */
6495 if (reduction_type == CONST_COND_REDUCTION
6496 || reduction_type == INTEGER_INDUC_COND_REDUCTION)
6498 orig_code = STMT_VINFO_VEC_COND_REDUC_CODE (stmt_info);
6499 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6502 STMT_VINFO_REDUC_CODE (stmt_info) = orig_code;
6504 reduc_fn = IFN_LAST;
6506 if (reduction_type == TREE_CODE_REDUCTION
6507 || reduction_type == FOLD_LEFT_REDUCTION
6508 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6509 || reduction_type == CONST_COND_REDUCTION)
6511 if (reduction_type == FOLD_LEFT_REDUCTION
6512 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6513 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6515 if (reduc_fn != IFN_LAST
6516 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6517 OPTIMIZE_FOR_SPEED))
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6521 "reduc op not supported by target.\n");
6523 reduc_fn = IFN_LAST;
6526 else
6528 if (!nested_cycle || double_reduc)
6530 if (dump_enabled_p ())
6531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6532 "no reduc code for scalar code.\n");
6534 return false;
6538 else if (reduction_type == COND_REDUCTION)
6540 int scalar_precision
6541 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6542 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6543 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6544 nunits_out);
6546 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6547 OPTIMIZE_FOR_SPEED))
6548 reduc_fn = IFN_REDUC_MAX;
6550 STMT_VINFO_REDUC_FN (stmt_info) = reduc_fn;
6552 if (reduction_type != EXTRACT_LAST_REDUCTION
6553 && (!nested_cycle || double_reduc)
6554 && reduc_fn == IFN_LAST
6555 && !nunits_out.is_constant ())
6557 if (dump_enabled_p ())
6558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6559 "missing target support for reduction on"
6560 " variable-length vectors.\n");
6561 return false;
6564 /* For SLP reductions, see if there is a neutral value we can use. */
6565 tree neutral_op = NULL_TREE;
6566 if (slp_node)
6567 neutral_op = neutral_op_for_slp_reduction
6568 (slp_node_instance->reduc_phis, code,
6569 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6571 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6573 /* We can't support in-order reductions of code such as this:
6575 for (int i = 0; i < n1; ++i)
6576 for (int j = 0; j < n2; ++j)
6577 l += a[j];
6579 since GCC effectively transforms the loop when vectorizing:
6581 for (int i = 0; i < n1 / VF; ++i)
6582 for (int j = 0; j < n2; ++j)
6583 for (int k = 0; k < VF; ++k)
6584 l += a[j];
6586 which is a reassociation of the original operation. */
6587 if (dump_enabled_p ())
6588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6589 "in-order double reduction not supported.\n");
6591 return false;
6594 if (reduction_type == FOLD_LEFT_REDUCTION
6595 && slp_node
6596 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6598 /* We cannot use in-order reductions in this case because there is
6599 an implicit reassociation of the operations involved. */
6600 if (dump_enabled_p ())
6601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6602 "in-order unchained SLP reductions not supported.\n");
6603 return false;
6606 /* For double reductions, and for SLP reductions with a neutral value,
6607 we construct a variable-length initial vector by loading a vector
6608 full of the neutral value and then shift-and-inserting the start
6609 values into the low-numbered elements. */
6610 if ((double_reduc || neutral_op)
6611 && !nunits_out.is_constant ()
6612 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6613 vectype_out, OPTIMIZE_FOR_SPEED))
6615 if (dump_enabled_p ())
6616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6617 "reduction on variable-length vectors requires"
6618 " target support for a vector-shift-and-insert"
6619 " operation.\n");
6620 return false;
6623 /* Check extra constraints for variable-length unchained SLP reductions. */
6624 if (STMT_SLP_TYPE (stmt_info)
6625 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6626 && !nunits_out.is_constant ())
6628 /* We checked above that we could build the initial vector when
6629 there's a neutral element value. Check here for the case in
6630 which each SLP statement has its own initial value and in which
6631 that value needs to be repeated for every instance of the
6632 statement within the initial vector. */
6633 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6634 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6635 if (!neutral_op
6636 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6638 if (dump_enabled_p ())
6639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6640 "unsupported form of SLP reduction for"
6641 " variable-length vectors: cannot build"
6642 " initial vector.\n");
6643 return false;
6645 /* The epilogue code relies on the number of elements being a multiple
6646 of the group size. The duplicate-and-interleave approach to setting
6647 up the initial vector does too. */
6648 if (!multiple_p (nunits_out, group_size))
6650 if (dump_enabled_p ())
6651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6652 "unsupported form of SLP reduction for"
6653 " variable-length vectors: the vector size"
6654 " is not a multiple of the number of results.\n");
6655 return false;
6659 /* In case of widening multiplication by a constant, we update the type
6660 of the constant to be the type of the other operand. We check that the
6661 constant fits the type in the pattern recognition pass. */
6662 if (code == DOT_PROD_EXPR
6663 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6665 gcc_unreachable ();
6666 /* No testcase for this. PR49478. */
6667 if (TREE_CODE (ops[0]) == INTEGER_CST)
6668 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6669 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6670 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6671 else
6673 if (dump_enabled_p ())
6674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6675 "invalid types in dot-prod\n");
6677 return false;
6681 if (reduction_type == COND_REDUCTION)
6683 widest_int ni;
6685 if (! max_loop_iterations (loop, &ni))
6687 if (dump_enabled_p ())
6688 dump_printf_loc (MSG_NOTE, vect_location,
6689 "loop count not known, cannot create cond "
6690 "reduction.\n");
6691 return false;
6693 /* Convert backedges to iterations. */
6694 ni += 1;
6696 /* The additional index will be the same type as the condition. Check
6697 that the loop can fit into this less one (because we'll use up the
6698 zero slot for when there are no matches). */
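/* For example, if the condition's scalar type is a 32-bit int, the index
   vector uses a 32-bit unsigned type, so the loop may run at most
   2^32 - 2 iterations: index 0 is reserved for "no match" and the
   largest index used must still be representable.  */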
6699 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6700 if (wi::geu_p (ni, wi::to_widest (max_index)))
6702 if (dump_enabled_p ())
6703 dump_printf_loc (MSG_NOTE, vect_location,
6704 "loop size is greater than data size.\n");
6705 return false;
6709 /* In case the vectorization factor (VF) is bigger than the number
6710 of elements that we can fit in a vectype (nunits), we have to generate
6711 more than one vector stmt, i.e., we need to "unroll" the
6712 vector stmt by a factor of VF/nunits. For more details see the
6713 documentation in vectorizable_operation. */
6715 /* If the reduction is used in an outer loop we need to generate
6716 VF intermediate results, like so (e.g. for ncopies=2):
6717 r0 = phi (init, r0)
6718 r1 = phi (init, r1)
6719 r0 = x0 + r0;
6720 r1 = x1 + r1;
6721 (i.e. we generate VF results in 2 registers).
6722 In this case we have a separate def-use cycle for each copy, and therefore
6723 for each copy we get the vector def for the reduction variable from the
6724 respective phi node created for this copy.
6726 Otherwise (the reduction is unused in the loop nest), we can combine
6727 together intermediate results, like so (e.g. for ncopies=2):
6728 r = phi (init, r)
6729 r = x0 + r;
6730 r = x1 + r;
6731 (i.e. we generate VF/2 results in a single register).
6732 In this case for each copy we get the vector def for the reduction variable
6733 from the vectorized reduction operation generated in the previous iteration.
6735 This only works when we see both the reduction PHI and its only consumer
6736 in vectorizable_reduction and there are no intermediate stmts
6737 participating. */
6738 stmt_vec_info use_stmt_info;
6739 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6740 if (ncopies > 1
6741 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6742 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6743 && (!STMT_VINFO_IN_PATTERN_P (use_stmt_info)
6744 || !STMT_VINFO_PATTERN_DEF_SEQ (use_stmt_info))
6745 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6746 single_defuse_cycle = true;
6748 /* If the reduction stmt is one of the patterns that have lane
6749 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6750 if ((ncopies > 1
6751 && ! single_defuse_cycle)
6752 && (code == DOT_PROD_EXPR
6753 || code == WIDEN_SUM_EXPR
6754 || code == SAD_EXPR))
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 "multi def-use cycle not possible for lane-reducing "
6759 "reduction operation\n");
6760 return false;
6763 if (slp_node)
6764 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6765 else
6766 vec_num = 1;
6768 internal_fn cond_fn = get_conditional_internal_fn (code);
6769 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6770 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6772 if (!vec_stmt) /* transformation not required. */
6774 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6775 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6777 if (reduction_type != FOLD_LEFT_REDUCTION
6778 && !mask_by_cond_expr
6779 && (cond_fn == IFN_LAST
6780 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6781 OPTIMIZE_FOR_SPEED)))
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785 "can't use a fully-masked loop because no"
6786 " conditional operation is available.\n");
6787 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6789 else if (reduc_index == -1)
6791 if (dump_enabled_p ())
6792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6793 "can't use a fully-masked loop for chained"
6794 " reductions.\n");
6795 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6797 else
6798 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6799 vectype_in);
6801 if (dump_enabled_p ()
6802 && reduction_type == FOLD_LEFT_REDUCTION)
6803 dump_printf_loc (MSG_NOTE, vect_location,
6804 "using an in-order (fold-left) reduction.\n");
6805 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6806 return true;
6810 /* Transform. */
6811 stmt_vec_info new_stmt_info = NULL;
6812 stmt_vec_info prev_stmt_info;
6813 tree new_temp = NULL_TREE;
6814 auto_vec<tree> vec_oprnds0;
6815 auto_vec<tree> vec_oprnds1;
6816 auto_vec<tree> vec_oprnds2;
6817 tree def0;
6819 if (dump_enabled_p ())
6820 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6822 /* FORNOW: Multiple types are not supported for condition. */
6823 if (code == COND_EXPR)
6824 gcc_assert (ncopies == 1);
6826 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6828 if (reduction_type == FOLD_LEFT_REDUCTION)
6829 return vectorize_fold_left_reduction
6830 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6831 reduc_fn, ops, vectype_in, reduc_index, masks);
6833 if (reduction_type == EXTRACT_LAST_REDUCTION)
6835 gcc_assert (!slp_node && reduc_index > 0);
6836 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6837 true, reduc_index, NULL, NULL);
6840 /* Create the destination vector */
6841 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6843 prev_stmt_info = NULL;
6844 prev_phi_info = NULL;
6845 if (!slp_node)
6847 vec_oprnds0.create (1);
6848 vec_oprnds1.create (1);
6849 if (op_type == ternary_op)
6850 vec_oprnds2.create (1);
6853 for (j = 0; j < ncopies; j++)
6855 if (code == COND_EXPR)
6857 gcc_assert (!slp_node && (nested_cycle || reduc_index > 0));
6858 vectorizable_condition (stmt_info, gsi, vec_stmt, true,
6859 reduc_index, NULL, NULL);
6860 break;
6862 if (code == LSHIFT_EXPR
6863 || code == RSHIFT_EXPR)
6865 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
6866 break;
6869 /* Handle uses. */
6870 if (j == 0)
6872 if (slp_node)
6874 /* Get vec defs for all the operands except the reduction index,
6875 ensuring the ordering of the ops in the vector is kept. */
6876 auto_vec<tree, 3> slp_ops;
6877 auto_vec<vec<tree>, 3> vec_defs;
6879 slp_ops.quick_push (ops[0]);
6880 slp_ops.quick_push (ops[1]);
6881 if (op_type == ternary_op)
6882 slp_ops.quick_push (ops[2]);
6884 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6886 vec_oprnds0.safe_splice (vec_defs[0]);
6887 vec_defs[0].release ();
6888 vec_oprnds1.safe_splice (vec_defs[1]);
6889 vec_defs[1].release ();
6890 if (op_type == ternary_op)
6892 vec_oprnds2.safe_splice (vec_defs[2]);
6893 vec_defs[2].release ();
6896 else
6898 vec_oprnds0.quick_push
6899 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6900 vec_oprnds1.quick_push
6901 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6902 if (op_type == ternary_op)
6903 vec_oprnds2.quick_push
6904 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6907 else
6909 if (!slp_node)
6911 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6913 if (single_defuse_cycle && reduc_index == 0)
6914 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6915 else
6916 vec_oprnds0[0]
6917 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6918 vec_oprnds0[0]);
6919 if (single_defuse_cycle && reduc_index == 1)
6920 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6921 else
6922 vec_oprnds1[0]
6923 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6924 vec_oprnds1[0]);
6925 if (op_type == ternary_op)
6927 if (single_defuse_cycle && reduc_index == 2)
6928 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6929 else
6930 vec_oprnds2[0]
6931 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6932 vec_oprnds2[0]);
6937 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6939 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6940 if (masked_loop_p && !mask_by_cond_expr)
6942 /* Make sure that the reduction accumulator is vop[0]. */
6943 if (reduc_index == 1)
6945 gcc_assert (commutative_tree_code (code));
6946 std::swap (vop[0], vop[1]);
6948 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6949 vectype_in, i * ncopies + j);
6950 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6951 vop[0], vop[1],
6952 vop[0]);
6953 new_temp = make_ssa_name (vec_dest, call);
6954 gimple_call_set_lhs (call, new_temp);
6955 gimple_call_set_nothrow (call, true);
6956 new_stmt_info
6957 = vect_finish_stmt_generation (stmt_info, call, gsi);
6959 else
6961 if (op_type == ternary_op)
6962 vop[2] = vec_oprnds2[i];
6964 if (masked_loop_p && mask_by_cond_expr)
6966 tree mask = vect_get_loop_mask (gsi, masks,
6967 vec_num * ncopies,
6968 vectype_in, i * ncopies + j);
6969 build_vect_cond_expr (code, vop, mask, gsi);
6972 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6973 vop[0], vop[1], vop[2]);
6974 new_temp = make_ssa_name (vec_dest, new_stmt);
6975 gimple_assign_set_lhs (new_stmt, new_temp);
6976 new_stmt_info
6977 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6980 if (slp_node)
6981 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6984 if (slp_node || single_defuse_cycle)
6985 continue;
6987 if (j == 0)
6988 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6989 else
6990 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6992 prev_stmt_info = new_stmt_info;
6995 if (single_defuse_cycle && !slp_node)
6996 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6998 return true;
7001 /* Vectorizes LC PHIs of nested cycles (so far). */
7003 bool
7004 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7005 slp_tree slp_node)
7007 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7008 if (!loop_vinfo
7009 || !is_a <gphi *> (stmt_info->stmt)
7010 || gimple_phi_num_args (stmt_info->stmt) != 1)
7011 return false;
7013 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7014 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7015 return false;
7017 if (!vec_stmt) /* transformation not required. */
7019 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7020 return true;
7023 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7024 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7025 basic_block bb = gimple_bb (stmt_info->stmt);
7026 edge e = single_pred_edge (bb);
7027 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7028 vec<tree> vec_oprnds = vNULL;
7029 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7030 stmt_info, &vec_oprnds, NULL, slp_node);
7031 if (slp_node)
7033 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7034 gcc_assert (vec_oprnds.length () == vec_num);
7035 for (unsigned i = 0; i < vec_num; i++)
7037 /* Create the vectorized LC PHI node. */
7038 gphi *new_phi = create_phi_node (vec_dest, bb);
7039 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7040 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7041 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7044 else
7046 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7047 stmt_vec_info prev_phi_info = NULL;
7048 for (unsigned i = 0; i < ncopies; i++)
7050 if (i != 0)
7051 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7052 /* Create the vectorized LC PHI node. */
7053 gphi *new_phi = create_phi_node (vec_dest, bb);
7054 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7055 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7056 if (i == 0)
7057 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7058 else
7059 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7060 prev_phi_info = new_phi_info;
7063 vec_oprnds.release ();
7065 return true;
7069 /* Function vect_min_worthwhile_factor.
7071 For a loop where we could vectorize the operation indicated by CODE,
7072 return the minimum vectorization factor that makes it worthwhile
7073 to use generic vectors. */
7074 static unsigned int
7075 vect_min_worthwhile_factor (enum tree_code code)
7077 switch (code)
7079 case PLUS_EXPR:
7080 case MINUS_EXPR:
7081 case NEGATE_EXPR:
7082 return 4;
7084 case BIT_AND_EXPR:
7085 case BIT_IOR_EXPR:
7086 case BIT_XOR_EXPR:
7087 case BIT_NOT_EXPR:
7088 return 2;
7090 default:
7091 return INT_MAX;
7095 /* Return true if VINFO indicates we are doing loop vectorization and if
7096 it is worth decomposing CODE operations into scalar operations for
7097 that loop's vectorization factor. */
7099 bool
7100 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7102 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7103 unsigned HOST_WIDE_INT value;
7104 return (loop_vinfo
7105 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7106 && value >= vect_min_worthwhile_factor (code));
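/* Editor's sketch (hypothetical caller, for illustration only):

     if (vect_worthwhile_without_simd_p (vinfo, PLUS_EXPR))
       ...  // constant VF >= 4, so emulating the vector add with
            // scalar word operations is still considered a win

   With the table above, PLUS_EXPR needs VF >= 4, the bitwise codes
   need VF >= 2, and any other code never qualifies because
   vect_min_worthwhile_factor returns INT_MAX for it.  */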
7109 /* Function vectorizable_induction
7111 Check if STMT_INFO performs an induction computation that can be vectorized.
7112 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7113 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7114 Return true if STMT_INFO is vectorizable in this way. */
7116 bool
7117 vectorizable_induction (stmt_vec_info stmt_info,
7118 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7119 stmt_vec_info *vec_stmt, slp_tree slp_node,
7120 stmt_vector_for_cost *cost_vec)
7122 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7123 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7124 unsigned ncopies;
7125 bool nested_in_vect_loop = false;
7126 class loop *iv_loop;
7127 tree vec_def;
7128 edge pe = loop_preheader_edge (loop);
7129 basic_block new_bb;
7130 tree new_vec, vec_init, vec_step, t;
7131 tree new_name;
7132 gimple *new_stmt;
7133 gphi *induction_phi;
7134 tree induc_def, vec_dest;
7135 tree init_expr, step_expr;
7136 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7137 unsigned i;
7138 tree expr;
7139 gimple_seq stmts;
7140 imm_use_iterator imm_iter;
7141 use_operand_p use_p;
7142 gimple *exit_phi;
7143 edge latch_e;
7144 tree loop_arg;
7145 gimple_stmt_iterator si;
7147 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7148 if (!phi)
7149 return false;
7151 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7152 return false;
7154 /* Make sure it was recognized as induction computation. */
7155 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7156 return false;
7158 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7159 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7161 if (slp_node)
7162 ncopies = 1;
7163 else
7164 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7165 gcc_assert (ncopies >= 1);
7167 /* FORNOW. These restrictions should be relaxed. */
7168 if (nested_in_vect_loop_p (loop, stmt_info))
7170 imm_use_iterator imm_iter;
7171 use_operand_p use_p;
7172 gimple *exit_phi;
7173 edge latch_e;
7174 tree loop_arg;
7176 if (ncopies > 1)
7178 if (dump_enabled_p ())
7179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7180 "multiple types in nested loop.\n");
7181 return false;
7184 /* FORNOW: outer loop induction with SLP not supported. */
7185 if (STMT_SLP_TYPE (stmt_info))
7186 return false;
7188 exit_phi = NULL;
7189 latch_e = loop_latch_edge (loop->inner);
7190 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7191 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7193 gimple *use_stmt = USE_STMT (use_p);
7194 if (is_gimple_debug (use_stmt))
7195 continue;
7197 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7199 exit_phi = use_stmt;
7200 break;
7203 if (exit_phi)
7205 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7206 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7207 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7209 if (dump_enabled_p ())
7210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7211 "inner-loop induction only used outside "
7212 "of the outer vectorized loop.\n");
7213 return false;
7217 nested_in_vect_loop = true;
7218 iv_loop = loop->inner;
7220 else
7221 iv_loop = loop;
7222 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7224 if (slp_node && !nunits.is_constant ())
7226 /* The current SLP code creates the initial value element-by-element. */
7227 if (dump_enabled_p ())
7228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7229 "SLP induction not supported for variable-length"
7230 " vectors.\n");
7231 return false;
7234 if (!vec_stmt) /* transformation not required. */
7236 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7237 DUMP_VECT_SCOPE ("vectorizable_induction");
7238 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7239 return true;
7242 /* Transform. */
7244 /* Compute a vector variable, initialized with the first VF values of
7245 the induction variable. E.g., for an iv with IV_PHI='X' and
7246 evolution S, for a vector of 4 units, we want to compute:
7247 [X, X + S, X + 2*S, X + 3*S]. */
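/* Editor's worked example (illustrative): for the scalar loop

     for (i = 0; i < n; i++)
       ... = 3 * i;            // IV with start X == 0 and step S == 3

   with a vectorization factor of 4 and 4-element vectors, the prolog
   builds vec_init == { 0, 3, 6, 9 } and vec_step == { 12, 12, 12, 12 },
   and every vector iteration adds vec_step to the induction PHI.  */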
7249 if (dump_enabled_p ())
7250 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7252 latch_e = loop_latch_edge (iv_loop);
7253 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7255 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7256 gcc_assert (step_expr != NULL_TREE);
7257 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7259 pe = loop_preheader_edge (iv_loop);
7260 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7261 loop_preheader_edge (iv_loop));
7263 stmts = NULL;
7264 if (!nested_in_vect_loop)
7266 /* Convert the initial value to the IV update type. */
7267 tree new_type = TREE_TYPE (step_expr);
7268 init_expr = gimple_convert (&stmts, new_type, init_expr);
7270 /* If we are using the loop mask to "peel" for alignment then we need
7271 to adjust the start value here. */
7272 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7273 if (skip_niters != NULL_TREE)
7275 if (FLOAT_TYPE_P (vectype))
7276 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7277 skip_niters);
7278 else
7279 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7280 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7281 skip_niters, step_expr);
7282 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7283 init_expr, skip_step);
7287 if (stmts)
7289 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7290 gcc_assert (!new_bb);
7293 /* Find the first insertion point in the BB. */
7294 basic_block bb = gimple_bb (phi);
7295 si = gsi_after_labels (bb);
7297 /* For SLP induction we have to generate several IVs; for example,
7298 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7299 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7300 [VF*S, VF*S, VF*S, VF*S] for all of them. */
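/* Editor's worked example: for the case above (group_size == 3,
   4-element vectors), least_common_multiple (3, 4) == 12, so the code
   below builds nivs == 12 / 4 == 3 distinct initial vectors; any
   further vectors are derived from them by adding the re-use step
   with VF' == 12 / 3 == 4, i.e. { 4*S, 4*S, 4*S, 4*S }.  */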
7301 if (slp_node)
7303 /* Enforced above. */
7304 unsigned int const_nunits = nunits.to_constant ();
7306 /* Generate [VF*S, VF*S, ... ]. */
7307 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7309 expr = build_int_cst (integer_type_node, vf);
7310 expr = fold_convert (TREE_TYPE (step_expr), expr);
7312 else
7313 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7314 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7315 expr, step_expr);
7316 if (! CONSTANT_CLASS_P (new_name))
7317 new_name = vect_init_vector (stmt_info, new_name,
7318 TREE_TYPE (step_expr), NULL);
7319 new_vec = build_vector_from_val (step_vectype, new_name);
7320 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7322 /* Now generate the IVs. */
7323 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7324 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7325 unsigned elts = const_nunits * nvects;
7326 unsigned nivs = least_common_multiple (group_size,
7327 const_nunits) / const_nunits;
7328 gcc_assert (elts % group_size == 0);
7329 tree elt = init_expr;
7330 unsigned ivn;
7331 for (ivn = 0; ivn < nivs; ++ivn)
7333 tree_vector_builder elts (step_vectype, const_nunits, 1);
7334 stmts = NULL;
7335 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7337 if (ivn*const_nunits + eltn >= group_size
7338 && (ivn * const_nunits + eltn) % group_size == 0)
7339 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7340 elt, step_expr);
7341 elts.quick_push (elt);
7343 vec_init = gimple_build_vector (&stmts, &elts);
7344 vec_init = gimple_convert (&stmts, vectype, vec_init);
7345 if (stmts)
7347 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7348 gcc_assert (!new_bb);
7351 /* Create the induction-phi that defines the induction-operand. */
7352 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7353 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7354 stmt_vec_info induction_phi_info
7355 = loop_vinfo->add_stmt (induction_phi);
7356 induc_def = PHI_RESULT (induction_phi);
7358 /* Create the iv update inside the loop */
7359 gimple_seq stmts = NULL;
7360 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7361 vec_def = gimple_build (&stmts,
7362 PLUS_EXPR, step_vectype, vec_def, vec_step);
7363 vec_def = gimple_convert (&stmts, vectype, vec_def);
7364 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7365 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7367 /* Set the arguments of the phi node: */
7368 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7369 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7370 UNKNOWN_LOCATION);
7372 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7375 /* Re-use IVs when we can. */
7376 if (ivn < nvects)
7378 unsigned vfp
7379 = least_common_multiple (group_size, const_nunits) / group_size;
7380 /* Generate [VF'*S, VF'*S, ... ]. */
7381 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7383 expr = build_int_cst (integer_type_node, vfp);
7384 expr = fold_convert (TREE_TYPE (step_expr), expr);
7386 else
7387 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7388 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7389 expr, step_expr);
7390 if (! CONSTANT_CLASS_P (new_name))
7391 new_name = vect_init_vector (stmt_info, new_name,
7392 TREE_TYPE (step_expr), NULL);
7393 new_vec = build_vector_from_val (step_vectype, new_name);
7394 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7395 for (; ivn < nvects; ++ivn)
7397 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7398 tree def;
7399 if (gimple_code (iv) == GIMPLE_PHI)
7400 def = gimple_phi_result (iv);
7401 else
7402 def = gimple_assign_lhs (iv);
7403 gimple_seq stmts = NULL;
7404 def = gimple_convert (&stmts, step_vectype, def);
7405 def = gimple_build (&stmts,
7406 PLUS_EXPR, step_vectype, def, vec_step);
7407 def = gimple_convert (&stmts, vectype, def);
7408 if (gimple_code (iv) == GIMPLE_PHI)
7409 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7410 else
7412 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7413 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7415 SLP_TREE_VEC_STMTS (slp_node).quick_push
7416 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7420 return true;
7423 /* Create the vector that holds the initial_value of the induction. */
7424 if (nested_in_vect_loop)
7426 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7427 been created during vectorization of previous stmts. We obtain it
7428 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7429 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7430 /* If the initial value is not of proper type, convert it. */
7431 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7433 new_stmt
7434 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7435 vect_simple_var,
7436 "vec_iv_"),
7437 VIEW_CONVERT_EXPR,
7438 build1 (VIEW_CONVERT_EXPR, vectype,
7439 vec_init));
7440 vec_init = gimple_assign_lhs (new_stmt);
7441 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7442 new_stmt);
7443 gcc_assert (!new_bb);
7444 loop_vinfo->add_stmt (new_stmt);
7447 else
7449 /* iv_loop is the loop to be vectorized. Create:
7450 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7451 stmts = NULL;
7452 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7454 unsigned HOST_WIDE_INT const_nunits;
7455 if (nunits.is_constant (&const_nunits))
7457 tree_vector_builder elts (step_vectype, const_nunits, 1);
7458 elts.quick_push (new_name);
7459 for (i = 1; i < const_nunits; i++)
7461 /* Create: new_name_i = new_name + step_expr */
7462 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7463 new_name, step_expr);
7464 elts.quick_push (new_name);
7466 /* Create a vector from [new_name_0, new_name_1, ...,
7467 new_name_nunits-1] */
7468 vec_init = gimple_build_vector (&stmts, &elts);
7470 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7471 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7472 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7473 new_name, step_expr);
7474 else
7476 /* Build:
7477 [base, base, base, ...]
7478 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7479 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7480 gcc_assert (flag_associative_math);
7481 tree index = build_index_vector (step_vectype, 0, 1);
7482 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7483 new_name);
7484 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7485 step_expr);
7486 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7487 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7488 vec_init, step_vec);
7489 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7490 vec_init, base_vec);
7492 vec_init = gimple_convert (&stmts, vectype, vec_init);
7494 if (stmts)
7496 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7497 gcc_assert (!new_bb);
7502 /* Create the vector that holds the step of the induction. */
7503 if (nested_in_vect_loop)
7504 /* iv_loop is nested in the loop to be vectorized. Generate:
7505 vec_step = [S, S, S, S] */
7506 new_name = step_expr;
7507 else
7509 /* iv_loop is the loop to be vectorized. Generate:
7510 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7511 gimple_seq seq = NULL;
7512 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7514 expr = build_int_cst (integer_type_node, vf);
7515 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7517 else
7518 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7519 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7520 expr, step_expr);
7521 if (seq)
7523 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7524 gcc_assert (!new_bb);
7528 t = unshare_expr (new_name);
7529 gcc_assert (CONSTANT_CLASS_P (new_name)
7530 || TREE_CODE (new_name) == SSA_NAME);
7531 new_vec = build_vector_from_val (step_vectype, t);
7532 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7535 /* Create the following def-use cycle:
7536 loop prolog:
7537 vec_init = ...
7538 vec_step = ...
7539 loop:
7540 vec_iv = PHI <vec_init, vec_loop>
7542 STMT
7544 vec_loop = vec_iv + vec_step; */
7546 /* Create the induction-phi that defines the induction-operand. */
7547 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7548 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7549 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7550 induc_def = PHI_RESULT (induction_phi);
7552 /* Create the iv update inside the loop */
7553 stmts = NULL;
7554 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7555 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7556 vec_def = gimple_convert (&stmts, vectype, vec_def);
7557 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7558 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7559 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7561 /* Set the arguments of the phi node: */
7562 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7563 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7564 UNKNOWN_LOCATION);
7566 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7568 /* If the vectorization factor (VF) is bigger than the number
7569 of elements that we can fit in a vectype (nunits), we have to generate
7570 more than one vector stmt - i.e., we need to "unroll" the
7571 vector stmt by a factor of VF/nunits. For more details see the documentation
7572 in vectorizable_operation. */
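/* Editor's worked example: with VF == 8 and 4-element vectors,
   ncopies == 2.  The code below derives the second IV copy from the
   first by adding { 4*S, 4*S, 4*S, 4*S } (nunits times the step per
   lane), so the two copies together cover X, X+S, ..., X+7*S.  */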
7574 if (ncopies > 1)
7576 gimple_seq seq = NULL;
7577 stmt_vec_info prev_stmt_vinfo;
7578 /* FORNOW. This restriction should be relaxed. */
7579 gcc_assert (!nested_in_vect_loop);
7581 /* Create the vector that holds the step of the induction. */
7582 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7584 expr = build_int_cst (integer_type_node, nunits);
7585 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7587 else
7588 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7589 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7590 expr, step_expr);
7591 if (seq)
7593 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7594 gcc_assert (!new_bb);
7597 t = unshare_expr (new_name);
7598 gcc_assert (CONSTANT_CLASS_P (new_name)
7599 || TREE_CODE (new_name) == SSA_NAME);
7600 new_vec = build_vector_from_val (step_vectype, t);
7601 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7603 vec_def = induc_def;
7604 prev_stmt_vinfo = induction_phi_info;
7605 for (i = 1; i < ncopies; i++)
7607 /* vec_i = vec_prev + vec_step */
7608 gimple_seq stmts = NULL;
7609 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7610 vec_def = gimple_build (&stmts,
7611 PLUS_EXPR, step_vectype, vec_def, vec_step);
7612 vec_def = gimple_convert (&stmts, vectype, vec_def);
7614 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7615 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7616 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7617 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7618 prev_stmt_vinfo = new_stmt_info;
7622 if (nested_in_vect_loop)
7624 /* Find the loop-closed exit-phi of the induction, and record
7625 the final vector of induction results: */
7626 exit_phi = NULL;
7627 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7629 gimple *use_stmt = USE_STMT (use_p);
7630 if (is_gimple_debug (use_stmt))
7631 continue;
7633 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7635 exit_phi = use_stmt;
7636 break;
7639 if (exit_phi)
7641 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7642 /* FORNOW. We do not yet support the case where an inner-loop induction
7643 is used only outside the outer loop (i.e. not in the outer loop itself). */
7644 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7645 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7647 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7648 if (dump_enabled_p ())
7649 dump_printf_loc (MSG_NOTE, vect_location,
7650 "vector of inductions after inner-loop:%G",
7651 new_stmt);
7656 if (dump_enabled_p ())
7657 dump_printf_loc (MSG_NOTE, vect_location,
7658 "transform induction: created def-use cycle: %G%G",
7659 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7661 return true;
7664 /* Function vectorizable_live_operation.
7666 STMT_INFO computes a value that is used outside the loop. Check if
7667 it can be supported. */
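/* Editor's sketch (illustrative): a typical live operation is

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);            // only the final value escapes the loop

   After vectorization the escaping value is taken from the last lane
   of the last vector copy, or via EXTRACT_LAST under the loop mask
   for fully-masked loops (see below).  */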
7669 bool
7670 vectorizable_live_operation (stmt_vec_info stmt_info,
7671 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7672 slp_tree slp_node, slp_instance slp_node_instance,
7673 int slp_index, stmt_vec_info *vec_stmt,
7674 stmt_vector_for_cost *)
7676 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7677 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7678 imm_use_iterator imm_iter;
7679 tree lhs, lhs_type, bitsize, vec_bitsize;
7680 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7681 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7682 int ncopies;
7683 gimple *use_stmt;
7684 auto_vec<tree> vec_oprnds;
7685 int vec_entry = 0;
7686 poly_uint64 vec_index = 0;
7688 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7690 /* The last stmt of a reduction is live and vectorized via
7691 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7692 validity so just trigger the transform here. */
7693 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7694 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7696 if (!vec_stmt)
7697 return true;
7698 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7700 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
7701 return true;
7702 if (slp_node)
7704 /* For reduction chains the meta-info is attached to
7705 the group leader. */
7706 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7707 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7708 /* For SLP reductions we vectorize the epilogue for
7709 all involved stmts together. */
7710 else if (slp_index != 0)
7711 return true;
7714 vect_create_epilog_for_reduction (stmt_info, slp_node,
7715 slp_node_instance);
7716 return true;
7719 /* FORNOW. CHECKME. */
7720 if (nested_in_vect_loop_p (loop, stmt_info))
7721 return false;
7723 /* If STMT is not relevant and it is a simple assignment and its inputs are
7724 invariant then it can remain in place, unvectorized. The original last
7725 scalar value that it computes will be used. */
7726 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7728 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7729 if (dump_enabled_p ())
7730 dump_printf_loc (MSG_NOTE, vect_location,
7731 "statement is simple and uses invariant. Leaving in "
7732 "place.\n");
7733 return true;
7736 if (slp_node)
7737 ncopies = 1;
7738 else
7739 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7741 if (slp_node)
7743 gcc_assert (slp_index >= 0);
7745 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7746 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7748 /* Get the last occurrence of the scalar index from the concatenation of
7749 all the slp vectors. Calculate which slp vector it is and the index
7750 within. */
7751 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
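/* Editor's worked example: with 2 scalar stmts and 2 SLP vectors of
   4 lanes each, the concatenation has 8 lanes and the last group
   occupies lanes 6..7.  For slp_index == 1 this gives
   pos == 8 - 2 + 1 == 7, hence vec_entry == 1 and vec_index == 3.  */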
7753 /* Calculate which vector contains the result, and which lane of
7754 that vector we need. */
7755 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7757 if (dump_enabled_p ())
7758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7759 "Cannot determine which vector holds the"
7760 " final result.\n");
7761 return false;
7765 if (!vec_stmt)
7767 /* No transformation required. */
7768 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7770 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7771 OPTIMIZE_FOR_SPEED))
7773 if (dump_enabled_p ())
7774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7775 "can't use a fully-masked loop because "
7776 "the target doesn't support extract last "
7777 "reduction.\n");
7778 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7780 else if (slp_node)
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7784 "can't use a fully-masked loop because an "
7785 "SLP statement is live after the loop.\n");
7786 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7788 else if (ncopies > 1)
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7792 "can't use a fully-masked loop because"
7793 " ncopies is greater than 1.\n");
7794 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7796 else
7798 gcc_assert (ncopies == 1 && !slp_node);
7799 vect_record_loop_mask (loop_vinfo,
7800 &LOOP_VINFO_MASKS (loop_vinfo),
7801 1, vectype);
7804 return true;
7807 /* Use the lhs of the original scalar statement. */
7808 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7810 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7811 : gimple_get_lhs (stmt);
7812 lhs_type = TREE_TYPE (lhs);
7814 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7815 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7816 : TYPE_SIZE (TREE_TYPE (vectype)));
7817 vec_bitsize = TYPE_SIZE (vectype);
7819 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7820 tree vec_lhs, bitstart;
7821 if (slp_node)
7823 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7825 /* Get the correct slp vectorized stmt. */
7826 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7827 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7828 vec_lhs = gimple_phi_result (phi);
7829 else
7830 vec_lhs = gimple_get_lhs (vec_stmt);
7832 /* Get entry to use. */
7833 bitstart = bitsize_int (vec_index);
7834 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7836 else
7838 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7839 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7840 gcc_checking_assert (ncopies == 1
7841 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7843 /* For multiple copies, get the last copy. */
7844 for (int i = 1; i < ncopies; ++i)
7845 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7847 /* Get the last lane in the vector. */
7848 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7851 gimple_seq stmts = NULL;
7852 tree new_tree;
7853 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7855 /* Emit:
7857 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7859 where VEC_LHS is the vectorized live-out result and MASK is
7860 the loop mask for the final iteration. */
7861 gcc_assert (ncopies == 1 && !slp_node);
7862 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7863 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7864 1, vectype, 0);
7865 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7866 scalar_type, mask, vec_lhs);
7868 /* Convert the extracted vector element to the required scalar type. */
7869 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7871 else
7873 tree bftype = TREE_TYPE (vectype);
7874 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7875 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7876 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7877 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7878 &stmts, true, NULL_TREE);
7881 if (stmts)
7882 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7884 /* Replace uses of lhs with the newly computed result. If the use stmt is a
7885 single-arg PHI, just replace all uses of the PHI result. This is necessary
7886 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7887 use_operand_p use_p;
7888 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7889 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7890 && !is_gimple_debug (use_stmt))
7892 if (gimple_code (use_stmt) == GIMPLE_PHI
7893 && gimple_phi_num_args (use_stmt) == 1)
7895 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7897 else
7899 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7900 SET_USE (use_p, new_tree);
7902 update_stmt (use_stmt);
7905 return true;
7908 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7910 static void
7911 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
7913 ssa_op_iter op_iter;
7914 imm_use_iterator imm_iter;
7915 def_operand_p def_p;
7916 gimple *ustmt;
7918 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7920 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7922 basic_block bb;
7924 if (!is_gimple_debug (ustmt))
7925 continue;
7927 bb = gimple_bb (ustmt);
7929 if (!flow_bb_inside_loop_p (loop, bb))
7931 if (gimple_debug_bind_p (ustmt))
7933 if (dump_enabled_p ())
7934 dump_printf_loc (MSG_NOTE, vect_location,
7935 "killing debug use\n");
7937 gimple_debug_bind_reset_value (ustmt);
7938 update_stmt (ustmt);
7940 else
7941 gcc_unreachable ();
7947 /* Given a loop represented by LOOP_VINFO, return true if the computation of
7948 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7949 otherwise. */
7951 static bool
7952 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7954 /* Constant case. */
7955 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7957 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7958 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7960 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7961 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7962 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7963 return true;
7966 widest_int max;
7967 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7968 /* Check the upper bound of loop niters. */
7969 if (get_max_loop_iterations (loop, &max))
7971 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7972 signop sgn = TYPE_SIGN (type);
7973 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7974 if (max < type_max)
7975 return true;
7977 return false;
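/* Editor's worked example (assuming NITERS has the counter's narrow
   unsigned type): for an 8-bit counter whose latch runs 255 times,
   NITERSM1 == 255 while NITERSM1 + 1 wraps to 0, and the loop's
   maximum latch count equals the type's maximum, so the function
   above returns false.  With at most 254 latch iterations it
   returns true.  */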
7980 /* Return a mask type with half the number of elements as TYPE. */
7982 tree
7983 vect_halve_mask_nunits (tree type)
7985 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7986 return build_truth_vector_type (nunits, current_vector_size);
7989 /* Return a mask type with twice as many elements as TYPE. */
7991 tree
7992 vect_double_mask_nunits (tree type)
7994 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7995 return build_truth_vector_type (nunits, current_vector_size);
7998 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7999 contain a sequence of NVECTORS masks that each control a vector of type
8000 VECTYPE. */
8002 void
8003 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8004 unsigned int nvectors, tree vectype)
8006 gcc_assert (nvectors != 0);
8007 if (masks->length () < nvectors)
8008 masks->safe_grow_cleared (nvectors);
8009 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8010 /* The number of scalars per iteration and the number of vectors are
8011 both compile-time constants. */
8012 unsigned int nscalars_per_iter
8013 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8014 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8015 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8017 rgm->max_nscalars_per_iter = nscalars_per_iter;
8018 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
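/* Editor's worked example: recording 2 masks for 8-element vectors in
   a loop with VF == 8 selects rgroup (*masks)[1] and sets its
   max_nscalars_per_iter to 2 * 8 / 8 == 2, i.e. that rgroup needs two
   mask lanes per scalar iteration.  */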
8022 /* Given a complete set of masks MASKS, extract mask number INDEX
8023 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8024 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8026 See the comment above vec_loop_masks for more details about the mask
8027 arrangement. */
8029 tree
8030 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8031 unsigned int nvectors, tree vectype, unsigned int index)
8033 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8034 tree mask_type = rgm->mask_type;
8036 /* Populate the rgroup's mask array, if this is the first time we've
8037 used it. */
8038 if (rgm->masks.is_empty ())
8040 rgm->masks.safe_grow_cleared (nvectors);
8041 for (unsigned int i = 0; i < nvectors; ++i)
8043 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8044 /* Provide a dummy definition until the real one is available. */
8045 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8046 rgm->masks[i] = mask;
8050 tree mask = rgm->masks[index];
8051 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8052 TYPE_VECTOR_SUBPARTS (vectype)))
8054 /* A loop mask for data type X can be reused for data type Y
8055 if X has N times more elements than Y and if Y's elements
8056 are N times bigger than X's. In this case each sequence
8057 of N elements in the loop mask will be all-zero or all-one.
8058 We can then view-convert the mask so that each sequence of
8059 N elements is replaced by a single element. */
8060 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8061 TYPE_VECTOR_SUBPARTS (vectype)));
8062 gimple_seq seq = NULL;
8063 mask_type = build_same_sized_truth_vector_type (vectype);
8064 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8065 if (seq)
8066 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8068 return mask;
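/* Editor's worked example: a mask recorded for 8 short elements can
   also control a vector of 4 ints covering the same bytes.  Each int
   lane corresponds to a pair of short lanes that are either both
   active or both inactive, so the 8-lane mask is simply
   view-converted to the 4-lane mask type required above.  */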
8071 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8072 according to the new estimated iteration count. */
8074 static void
8075 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8077 edge preheader = loop_preheader_edge (loop);
8078 /* Reduce loop iterations by the vectorization factor. */
8079 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8080 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8082 if (freq_h.nonzero_p ())
8084 profile_probability p;
8086 /* Avoid dropping loop body profile counter to 0 because of zero count
8087 in loop's preheader. */
8088 if (!(freq_e == profile_count::zero ()))
8089 freq_e = freq_e.force_nonzero ();
8090 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8091 scale_loop_frequencies (loop, p);
8094 edge exit_e = single_exit (loop);
8095 exit_e->probability = profile_probability::always ()
8096 .apply_scale (1, new_est_niter + 1);
8098 edge exit_l = single_pred_edge (loop->latch);
8099 profile_probability prob = exit_l->probability;
8100 exit_l->probability = exit_e->probability.invert ();
8101 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8102 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
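/* Editor's worked example: if the scalar loop was estimated to run
   about 100 iterations and VF == 4, the vector loop is expected to
   run about 25 iterations, so its exit edge gets probability 1/26
   and the latch edge the complement 25/26.  */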
8105 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8106 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8107 stmt_vec_info. */
8109 static void
8110 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8111 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8113 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8114 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8116 if (dump_enabled_p ())
8117 dump_printf_loc (MSG_NOTE, vect_location,
8118 "------>vectorizing statement: %G", stmt_info->stmt);
8120 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8121 vect_loop_kill_debug_uses (loop, stmt_info);
8123 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8124 && !STMT_VINFO_LIVE_P (stmt_info))
8125 return;
8127 if (STMT_VINFO_VECTYPE (stmt_info))
8129 poly_uint64 nunits
8130 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8131 if (!STMT_SLP_TYPE (stmt_info)
8132 && maybe_ne (nunits, vf)
8133 && dump_enabled_p ())
8134 /* For SLP, VF is set according to the unrolling factor and not
8135 to the vector size, hence this message is not valid for SLP. */
8136 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8139 /* Pure SLP statements have already been vectorized. We still need
8140 to apply loop vectorization to hybrid SLP statements. */
8141 if (PURE_SLP_STMT (stmt_info))
8142 return;
8144 if (dump_enabled_p ())
8145 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8147 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8148 *seen_store = stmt_info;
8151 /* Function vect_transform_loop.
8153 The analysis phase has determined that the loop is vectorizable.
8154 Vectorize the loop - create vectorized stmts to replace the scalar
8155 stmts in the loop, and update the loop exit condition.
8156 Returns the scalar epilogue loop, if any. */
8158 class loop *
8159 vect_transform_loop (loop_vec_info loop_vinfo)
8161 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8162 class loop *epilogue = NULL;
8163 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8164 int nbbs = loop->num_nodes;
8165 int i;
8166 tree niters_vector = NULL_TREE;
8167 tree step_vector = NULL_TREE;
8168 tree niters_vector_mult_vf = NULL_TREE;
8169 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8170 unsigned int lowest_vf = constant_lower_bound (vf);
8171 gimple *stmt;
8172 bool check_profitability = false;
8173 unsigned int th;
8175 DUMP_VECT_SCOPE ("vec_transform_loop");
8177 loop_vinfo->shared->check_datarefs ();
8179 /* Use the more conservative vectorization threshold. If the number
8180 of iterations is constant, assume the cost check has been performed
8181 by our caller. If the threshold makes all loops profitable that
8182 run at least the (estimated) vectorization factor number of times,
8183 checking is pointless, too. */
8184 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8185 if (th >= vect_vf_for_cost (loop_vinfo)
8186 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8188 if (dump_enabled_p ())
8189 dump_printf_loc (MSG_NOTE, vect_location,
8190 "Profitability threshold is %d loop iterations.\n",
8191 th);
8192 check_profitability = true;
8195 /* Make sure there exists a single-predecessor exit bb. Do this before
8196 versioning. */
8197 edge e = single_exit (loop);
8198 if (! single_pred_p (e->dest))
8200 split_loop_exit_edge (e, true);
8201 if (dump_enabled_p ())
8202 dump_printf (MSG_NOTE, "split exit edge\n");
8205 /* Version the loop first, if required, so the profitability check
8206 comes first. */
8208 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8210 poly_uint64 versioning_threshold
8211 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8212 if (check_profitability
8213 && ordered_p (poly_uint64 (th), versioning_threshold))
8215 versioning_threshold = ordered_max (poly_uint64 (th),
8216 versioning_threshold);
8217 check_profitability = false;
8219 class loop *sloop
8220 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8221 versioning_threshold);
8222 sloop->force_vectorize = false;
8223 check_profitability = false;
8226 /* Make sure there exists a single-predecessor exit bb also on the
8227 scalar loop copy. Do this after versioning but before peeling
8228 so the CFG structure is fine for both the scalar and the if-converted loop,
8229 which lets slpeel_duplicate_current_defs_from_edges see matched
8230 loop-closed PHI nodes on the exit. */
8231 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8233 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8234 if (! single_pred_p (e->dest))
8236 split_loop_exit_edge (e, true);
8237 if (dump_enabled_p ())
8238 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8242 tree niters = vect_build_loop_niters (loop_vinfo);
8243 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8244 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8245 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8246 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8247 &step_vector, &niters_vector_mult_vf, th,
8248 check_profitability, niters_no_overflow);
8249 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8250 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8251 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8252 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8254 if (niters_vector == NULL_TREE)
8256 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8257 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8258 && known_eq (lowest_vf, vf))
8260 niters_vector
8261 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8262 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8263 step_vector = build_one_cst (TREE_TYPE (niters));
8265 else
8266 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8267 &step_vector, niters_no_overflow);
8270 /* 1) Make sure the loop header has exactly two entries
8271 2) Make sure we have a preheader basic block. */
8273 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8275 split_edge (loop_preheader_edge (loop));
8277 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8278 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8279 /* This will deal with any possible peeling. */
8280 vect_prepare_for_masked_peels (loop_vinfo);
8282 /* Schedule the SLP instances first, then handle loop vectorization
8283 below. */
8284 if (!loop_vinfo->slp_instances.is_empty ())
8286 DUMP_VECT_SCOPE ("scheduling SLP instances");
8287 vect_schedule_slp (loop_vinfo);
8290 /* FORNOW: the vectorizer supports only loops whose body consists
8291 of one basic block (header + empty latch). When the vectorizer
8292 supports more involved loop forms, the order in which the BBs are
8293 traversed will need to be reconsidered. */
8295 for (i = 0; i < nbbs; i++)
8297 basic_block bb = bbs[i];
8298 stmt_vec_info stmt_info;
8300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8301 gsi_next (&si))
8303 gphi *phi = si.phi ();
8304 if (dump_enabled_p ())
8305 dump_printf_loc (MSG_NOTE, vect_location,
8306 "------>vectorizing phi: %G", phi);
8307 stmt_info = loop_vinfo->lookup_stmt (phi);
8308 if (!stmt_info)
8309 continue;
8311 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8312 vect_loop_kill_debug_uses (loop, stmt_info);
8314 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8315 && !STMT_VINFO_LIVE_P (stmt_info))
8316 continue;
8318 if (STMT_VINFO_VECTYPE (stmt_info)
8319 && (maybe_ne
8320 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8321 && dump_enabled_p ())
8322 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8324 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8325 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8326 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8327 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8328 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8329 && ! PURE_SLP_STMT (stmt_info))
8331 if (dump_enabled_p ())
8332 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8333 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8337 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8338 !gsi_end_p (si);)
8340 stmt = gsi_stmt (si);
8341 /* During vectorization remove existing clobber stmts. */
8342 if (gimple_clobber_p (stmt))
8344 unlink_stmt_vdef (stmt);
8345 gsi_remove (&si, true);
8346 release_defs (stmt);
8348 else
8350 stmt_info = loop_vinfo->lookup_stmt (stmt);
8352 /* vector stmts created in the outer-loop during vectorization of
8353 stmts in an inner-loop may not have a stmt_info, and do not
8354 need to be vectorized. */
8355 stmt_vec_info seen_store = NULL;
8356 if (stmt_info)
8358 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8360 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8361 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8362 !gsi_end_p (subsi); gsi_next (&subsi))
8364 stmt_vec_info pat_stmt_info
8365 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8366 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8367 &si, &seen_store);
8369 stmt_vec_info pat_stmt_info
8370 = STMT_VINFO_RELATED_STMT (stmt_info);
8371 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8372 &seen_store);
8374 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8375 &seen_store);
8377 gsi_next (&si);
8378 if (seen_store)
8380 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8381 /* Interleaving. The vectorization of the
8382 interleaving chain has been completed -
8383 free all the stores in the chain. */
8384 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8385 else
8386 /* Free the attached stmt_vec_info and remove the stmt. */
8387 loop_vinfo->remove_stmt (stmt_info);
8392 /* Stub out scalar statements that must not survive vectorization.
8393 Doing this here helps with grouped statements, or statements that
8394 are involved in patterns. */
8395 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8396 !gsi_end_p (gsi); gsi_next (&gsi))
8398 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8399 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8401 tree lhs = gimple_get_lhs (call);
8402 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8404 tree zero = build_zero_cst (TREE_TYPE (lhs));
8405 gimple *new_stmt = gimple_build_assign (lhs, zero);
8406 gsi_replace (&gsi, new_stmt, true);
8410 } /* BBs in loop */
8412 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8413 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8414 if (integer_onep (step_vector))
8415 niters_no_overflow = true;
8416 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8417 niters_vector_mult_vf, !niters_no_overflow);
8419 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8420 scale_profile_for_vect_loop (loop, assumed_vf);
8422 /* True if the final iteration might not handle a full vector's
8423 worth of scalar iterations. */
8424 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8425 /* The minimum number of iterations performed by the epilogue. This
8426 is 1 when peeling for gaps because we always need a final scalar
8427 iteration. */
8428 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8429 /* +1 to convert latch counts to loop iteration counts,
8430 -min_epilogue_iters to remove iterations that cannot be performed
8431 by the vector code. */
8432 int bias_for_lowest = 1 - min_epilogue_iters;
8433 int bias_for_assumed = bias_for_lowest;
8434 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8435 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8437 /* When the amount of peeling is known at compile time, the first
8438 iteration will have exactly alignment_npeels active elements.
8439 In the worst case it will have at least one. */
8440 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8441 bias_for_lowest += lowest_vf - min_first_active;
8442 bias_for_assumed += assumed_vf - min_first_active;
8444 /* In these calculations the "- 1" converts loop iteration counts
8445 back to latch counts. */
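/* Editor's worked example: with lowest_vf == 4, no peeling for gaps
   and no full masking, a known scalar latch bound of 99 (100
   iterations) becomes (99 + 1) / 4 - 1 == 24, i.e. at most 25
   iterations of the vector loop.  */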
8446 if (loop->any_upper_bound)
8447 loop->nb_iterations_upper_bound
8448 = (final_iter_may_be_partial
8449 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8450 lowest_vf) - 1
8451 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8452 lowest_vf) - 1);
8453 if (loop->any_likely_upper_bound)
8454 loop->nb_iterations_likely_upper_bound
8455 = (final_iter_may_be_partial
8456 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8457 + bias_for_lowest, lowest_vf) - 1
8458 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8459 + bias_for_lowest, lowest_vf) - 1);
8460 if (loop->any_estimate)
8461 loop->nb_iterations_estimate
8462 = (final_iter_may_be_partial
8463 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8464 assumed_vf) - 1
8465 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8466 assumed_vf) - 1);
8468 if (dump_enabled_p ())
8470 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8472 dump_printf_loc (MSG_NOTE, vect_location,
8473 "LOOP VECTORIZED\n");
8474 if (loop->inner)
8475 dump_printf_loc (MSG_NOTE, vect_location,
8476 "OUTER LOOP VECTORIZED\n");
8477 dump_printf (MSG_NOTE, "\n");
8479 else
8481 dump_printf_loc (MSG_NOTE, vect_location,
8482 "LOOP EPILOGUE VECTORIZED (VS=");
8483 dump_dec (MSG_NOTE, current_vector_size);
8484 dump_printf (MSG_NOTE, ")\n");
8488 /* Loops vectorized with a variable factor won't benefit from
8489 unrolling/peeling. */
8490 if (!vf.is_constant ())
8492 loop->unroll = 1;
8493 if (dump_enabled_p ())
8494 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8495 " variable-length vectorization factor\n");
8497 /* Free SLP instances here because otherwise stmt reference counting
8498 won't work. */
8499 slp_instance instance;
8500 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8501 vect_free_slp_instance (instance, true);
8502 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8503 /* Clear the safelen field since its value is invalid after vectorization,
8504 as the vectorized loop can have loop-carried dependencies. */
8505 loop->safelen = 0;
8507 /* Don't vectorize the epilogue of an epilogue loop. */
8508 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8509 epilogue = NULL;
8511 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8512 epilogue = NULL;
8514 if (epilogue)
8516 auto_vector_sizes vector_sizes;
8517 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8518 unsigned int next_size = 0;
8520 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8521 on niters already adjusted for the iterations of the prologue. */
8522 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8523 && known_eq (vf, lowest_vf))
8525 unsigned HOST_WIDE_INT eiters
8526 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8527 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8528 eiters
8529 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8530 epilogue->nb_iterations_upper_bound = eiters - 1;
8531 epilogue->any_upper_bound = true;
8533 unsigned int ratio;
8534 while (next_size < vector_sizes.length ()
8535 && !(constant_multiple_p (current_vector_size,
8536 vector_sizes[next_size], &ratio)
8537 && eiters >= lowest_vf / ratio))
8538 next_size += 1;
8540 else
8541 while (next_size < vector_sizes.length ()
8542 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8543 next_size += 1;
8545 if (next_size == vector_sizes.length ())
8546 epilogue = NULL;
8549 if (epilogue)
8551 epilogue->force_vectorize = loop->force_vectorize;
8552 epilogue->safelen = loop->safelen;
8553 epilogue->dont_vectorize = false;
8555 /* We may need to if-convert epilogue to vectorize it. */
8556 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8557 tree_if_conversion (epilogue);
8560 return epilogue;
8563 /* The code below performs a simple optimization - reverting
8564 if-conversion for masked stores: if the mask of a store is zero,
8565 do not perform the store, and, where possible, also skip the
8566 producers of the stored values.  For example,
8567 for (i=0; i<n; i++)
8568 if (c[i])
{
8570 p1[i] += 1;
8571 p2[i] = p3[i] + 2;
}
8573 this transformation will produce the following semi-hammock:

8575 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
{
8577 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8578 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8579 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8580 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8581 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8582 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
}
*/
8586 void
8587 optimize_mask_stores (class loop *loop)
8589 basic_block *bbs = get_loop_body (loop);
8590 unsigned nbbs = loop->num_nodes;
8591 unsigned i;
8592 basic_block bb;
8593 class loop *bb_loop;
8594 gimple_stmt_iterator gsi;
8595 gimple *stmt;
8596 auto_vec<gimple *> worklist;
8597 auto_purge_vect_location sentinel;
8599 vect_location = find_loop_location (loop);
8600 /* Pick up all masked stores in the loop, if any.  */
8601 for (i = 0; i < nbbs; i++)
8603 bb = bbs[i];
8604 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8605 gsi_next (&gsi))
8607 stmt = gsi_stmt (gsi);
8608 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8609 worklist.safe_push (stmt);
8613 free (bbs);
8614 if (worklist.is_empty ())
8615 return;
8617 /* Loop has masked stores. */
8618 while (!worklist.is_empty ())
8620 gimple *last, *last_store;
8621 edge e, efalse;
8622 tree mask;
8623 basic_block store_bb, join_bb;
8624 gimple_stmt_iterator gsi_to;
8625 tree vdef, new_vdef;
8626 gphi *phi;
8627 tree vectype;
8628 tree zero;
8630 last = worklist.pop ();
8631 mask = gimple_call_arg (last, 2);
8632 bb = gimple_bb (last);
8633 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8634 to the same loop as if_bb.  That loop can be different from LOOP when
8635 a two-level loop nest is vectorized and the mask_store belongs to the
8636 inner loop.  */
8637 e = split_block (bb, last);
8638 bb_loop = bb->loop_father;
8639 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8640 join_bb = e->dest;
8641 store_bb = create_empty_bb (bb);
8642 add_bb_to_loop (store_bb, bb_loop);
8643 e->flags = EDGE_TRUE_VALUE;
8644 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8645 /* Put STORE_BB on the likely path.  */
8646 efalse->probability = profile_probability::unlikely ();
8647 store_bb->count = efalse->count ();
8648 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8649 if (dom_info_available_p (CDI_DOMINATORS))
8650 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8651 if (dump_enabled_p ())
8652 dump_printf_loc (MSG_NOTE, vect_location,
8653 "Create new block %d to sink mask stores.",
8654 store_bb->index);
8655 /* Create vector comparison with boolean result. */
8656 vectype = TREE_TYPE (mask);
8657 zero = build_zero_cst (vectype);
8658 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8659 gsi = gsi_last_bb (bb);
8660 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
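/* With this comparison in place, the TRUE edge set up above (mask equal
   to zero) jumps straight to JOIN_BB and skips the stores, while the
   FALSE edge falls through into STORE_BB, where the masked stores are
   sunk below.  */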
8661 /* Create a new PHI node for the vdef of the last masked store:
8662 .MEM_2 = VDEF <.MEM_1>
8663 will be converted to
8664 .MEM_3 = VDEF <.MEM_1>
8665 and a new PHI node will be created in the join bb:
8666 .MEM_2 = PHI <.MEM_1, .MEM_3>
8668 vdef = gimple_vdef (last);
8669 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8670 gimple_set_vdef (last, new_vdef);
8671 phi = create_phi_node (vdef, join_bb);
8672 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8674 /* Put all masked stores with the same mask to STORE_BB if possible. */
8675 while (true)
8677 gimple_stmt_iterator gsi_from;
8678 gimple *stmt1 = NULL;
8680 /* Move masked store to STORE_BB. */
8681 last_store = last;
8682 gsi = gsi_for_stmt (last);
8683 gsi_from = gsi;
8684 /* Shift GSI to the previous stmt for further traversal. */
8685 gsi_prev (&gsi);
8686 gsi_to = gsi_start_bb (store_bb);
8687 gsi_move_before (&gsi_from, &gsi_to);
8689 /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
8689 gsi_to = gsi_start_bb (store_bb);
8690 if (dump_enabled_p ())
8691 dump_printf_loc (MSG_NOTE, vect_location,
8692 "Move stmt to created bb\n%G", last);
8693 /* Move all stored value producers if possible. */
8694 while (!gsi_end_p (gsi))
8696 tree lhs;
8697 imm_use_iterator imm_iter;
8698 use_operand_p use_p;
8699 bool res;
8701 /* Skip debug statements. */
8702 if (is_gimple_debug (gsi_stmt (gsi)))
8704 gsi_prev (&gsi);
8705 continue;
8707 stmt1 = gsi_stmt (gsi);
8708 /* Do not consider statements writing to memory or having a
8709 volatile operand.  */
8710 if (gimple_vdef (stmt1)
8711 || gimple_has_volatile_ops (stmt1))
8712 break;
8713 gsi_from = gsi;
8714 gsi_prev (&gsi);
8715 lhs = gimple_get_lhs (stmt1);
8716 if (!lhs)
8717 break;
8719 /* LHS of vectorized stmt must be SSA_NAME. */
8720 if (TREE_CODE (lhs) != SSA_NAME)
8721 break;
8723 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8725 /* Remove dead scalar statement. */
8726 if (has_zero_uses (lhs))
8728 gsi_remove (&gsi_from, true);
8729 continue;
8733 /* Check that LHS does not have uses outside of STORE_BB. */
8734 res = true;
8735 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8737 gimple *use_stmt;
8738 use_stmt = USE_STMT (use_p);
8739 if (is_gimple_debug (use_stmt))
8740 continue;
8741 if (gimple_bb (use_stmt) != store_bb)
8743 res = false;
8744 break;
8747 if (!res)
8748 break;
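/* A producer that reads memory may only be sunk if it reads the same
   memory state (the same VUSE) as the masked store it feeds; the walk
   already stops at any intervening memory write, and this check ensures
   the move cannot change what the statement reads.  */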
8750 if (gimple_vuse (stmt1)
8751 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8752 break;
8754 /* Can move STMT1 to STORE_BB. */
8755 if (dump_enabled_p ())
8756 dump_printf_loc (MSG_NOTE, vect_location,
8757 "Move stmt to created bb\n%G", stmt1);
8758 gsi_move_before (&gsi_from, &gsi_to);
8759 /* Shift GSI_TO for further insertion. */
8760 gsi_prev (&gsi_to);
8762 /* Put other masked stores with the same mask to STORE_BB. */
8763 if (worklist.is_empty ()
8764 || gimple_call_arg (worklist.last (), 2) != mask
8765 || worklist.last () != stmt1)
8766 break;
8767 last = worklist.pop ();
8769 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
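/* The vectorizer driver calls this function only for loops that contain
   masked stores, and only when the target reports that executing a masked
   store with an all-false mask is expensive
   (targetm.vectorize.empty_mask_is_expensive).  */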
8773 /* Decide whether it is possible to use a zero-based induction variable
8774 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
8775 return the value that the induction variable must be able to hold
8776 in order to ensure that the loop ends with an all-false mask.
8777 Return -1 otherwise. */
8778 widest_int
8779 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
8781 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8782 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8783 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
8785 /* Calculate the value that the induction variable must be able
8786 to hit in order to ensure that we end the loop with an all-false mask.
8787 This involves adding the maximum number of inactive trailing scalar
8788 iterations. */
8789 widest_int iv_limit = -1;
8790 if (max_loop_iterations (loop, &iv_limit))
8792 if (niters_skip)
8794 /* Add the maximum number of skipped iterations to the
8795 maximum iteration count. */
8796 if (TREE_CODE (niters_skip) == INTEGER_CST)
8797 iv_limit += wi::to_widest (niters_skip);
8798 else
8799 iv_limit += max_vf - 1;
8801 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
8802 /* Make a conservatively-correct assumption. */
8803 iv_limit += max_vf - 1;
8805 /* IV_LIMIT is the maximum number of latch iterations, which is also
8806 the maximum in-range IV value. Round this value down to the previous
8807 vector alignment boundary and then add an extra full iteration. */
8808 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8809 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
8811 return iv_limit;
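/* A worked example, with numbers chosen purely for illustration: for a
   loop with at most 1000 latch iterations, no skipped and no peeled
   iterations, and a constant VF of 16, known_alignment (vf) and max_vf
   are both 16, so iv_limit becomes (1000 & -16) + 16 == 992 + 16 == 1008.
   Callers use this bound to check that the induction variable and
   comparison types they pick have enough precision to hold it.  */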