2018-06-25 Richard Biener <rguenther@suse.de>
[official-gcc.git] / gcc / tree-vect-loop.c
blob dacc8811636a34140ca61838a1c0bb71e53e3c96
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it had been manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
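/* Illustration only, a self-contained sketch (not part of the vectorizer)
   of the transformation described in the comment above, written with
   GCC's generic vector extensions.  The names N, a, b, c, v8hi_t and
   add_arrays_by_hand are assumptions made for this sketch; the
   vectorizer emits the equivalent GIMPLE rather than source code.  */

#define N 1024

short a[N], b[N], c[N];

/* Eight 16-bit elements, i.e. one 16-byte vector.  */
typedef short v8hi_t __attribute__ ((vector_size (16)));

static void
add_arrays_by_hand (void)
{
  v8hi_t *pa = (v8hi_t *) a, *pb = (v8hi_t *) b, *pc = (v8hi_t *) c;
  for (int i = 0; i < N / 8; i++)
    pa[i] = pb[i] + pc[i];  /* one vector add per eight scalar adds */
}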
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
222 return false;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: ");
239 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
240 def_stmt_info->stmt, 0);
242 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
243 vf, mask_producers))
244 return false;
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: ");
251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
254 return false;
257 return true;
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264 loop. For example, when vectorizing a loop that operates on 4-byte elements,
265 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
271 in the loop.
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
274 original loop:
275 for (i=0; i<N; i++){
276 a[i] = b[i] + c[i];
279 vectorized loop:
280 for (i=0; i<N; i+=VF){
281 a[i:VF] = b[i:VF] + c[i:VF];
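/* Illustration only, a minimal sketch of the arithmetic described above
   (not GCC code; the helper name is hypothetical): with a single element
   size, the vectorization factor is simply the vector size divided by
   the element size.  */

static inline unsigned int
example_vf (unsigned int vector_size_in_bytes, unsigned int elem_size_in_bytes)
{
  /* e.g. example_vf (16, 4) == 4, matching the 4-byte/16-byte example
     in the comment above.  */
  return vector_size_in_bytes / elem_size_in_bytes;
}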
285 static bool
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
297 auto_vec<stmt_vec_info> mask_producers;
299 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301 for (i = 0; i < nbbs; i++)
303 basic_block bb = bbs[i];
305 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
306 gsi_next (&si))
308 phi = si.phi ();
309 stmt_info = vinfo_for_stmt (phi);
310 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
313 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
316 gcc_assert (stmt_info);
318 if (STMT_VINFO_RELEVANT_P (stmt_info)
319 || STMT_VINFO_LIVE_P (stmt_info))
321 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
322 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: ");
328 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
329 dump_printf (MSG_NOTE, "\n");
332 vectype = get_vectype_for_scalar_type (scalar_type);
333 if (!vectype)
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
338 "not vectorized: unsupported "
339 "data-type ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 scalar_type);
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 return false;
346 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348 if (dump_enabled_p ())
350 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
351 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
352 dump_printf (MSG_NOTE, "\n");
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
358 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
359 dump_printf (MSG_NOTE, "\n");
362 vect_update_max_nunits (&vectorization_factor, vectype);
366 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
367 gsi_next (&si))
369 stmt_info = vinfo_for_stmt (gsi_stmt (si));
370 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
371 &mask_producers))
372 return false;
376 /* TODO: Analyze cost. Decide if worth while to vectorize. */
377 if (dump_enabled_p ())
379 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
380 dump_dec (MSG_NOTE, vectorization_factor);
381 dump_printf (MSG_NOTE, "\n");
384 if (known_le (vectorization_factor, 1U))
386 if (dump_enabled_p ())
387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
388 "not vectorized: unsupported data-type\n");
389 return false;
391 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393 for (i = 0; i < mask_producers.length (); i++)
395 stmt_info = mask_producers[i];
396 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
397 if (!mask_type)
398 return false;
399 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
402 return true;
406 /* Function vect_is_simple_iv_evolution.
408 FORNOW: A simple evolution of an induction variable in the loop is
409 considered a polynomial evolution. */
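/* Illustration only, an assumed example of what "simple" means here:
   for the C loop

     for (i = 7; i < n; i += 3)
       ...

   scev describes i by the degree-1 chrec {7, +, 3}_1 (initial value 7,
   constant step 3 in loop number 1), which this function accepts with
   *INIT == 7 and *STEP == 3.  A degree-2 chrec such as
   {0, +, {1, +, 1}_1}_1, i.e. an IV whose step is itself an IV, is
   rejected.  */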
411 static bool
412 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
413 tree * step)
415 tree init_expr;
416 tree step_expr;
417 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
418 basic_block bb;
420 /* When there is no evolution in this loop, the evolution function
421 is not "simple". */
422 if (evolution_part == NULL_TREE)
423 return false;
425 /* When the evolution is a polynomial of degree >= 2
426 the evolution function is not "simple". */
427 if (tree_is_chrec (evolution_part))
428 return false;
430 step_expr = evolution_part;
431 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
436 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
437 dump_printf (MSG_NOTE, ", init: ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
439 dump_printf (MSG_NOTE, "\n");
442 *init = init_expr;
443 *step = step_expr;
445 if (TREE_CODE (step_expr) != INTEGER_CST
446 && (TREE_CODE (step_expr) != SSA_NAME
447 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
448 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
449 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
450 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
451 || !flag_associative_math)))
452 && (TREE_CODE (step_expr) != REAL_CST
453 || !flag_associative_math))
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
457 "step unknown.\n");
458 return false;
461 return true;
464 /* Function vect_analyze_scalar_cycles_1.
466 Examine the cross iteration def-use cycles of scalar variables
467 in LOOP. LOOP_VINFO represents the loop that is now being
468 considered for vectorization (can be LOOP, or an outer-loop
469 enclosing LOOP). */
471 static void
472 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 basic_block bb = loop->header;
475 tree init, step;
476 auto_vec<gimple *, 64> worklist;
477 gphi_iterator gsi;
478 bool double_reduc;
480 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482 /* First - identify all inductions. Reduction detection assumes that all the
483 inductions have been identified, therefore, this order must not be
484 changed. */
485 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 gphi *phi = gsi.phi ();
488 tree access_fn = NULL;
489 tree def = PHI_RESULT (phi);
490 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
492 if (dump_enabled_p ())
494 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
495 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
498 /* Skip virtual phi's. The data dependences that are associated with
499 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
500 if (virtual_operand_p (def))
501 continue;
503 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505 /* Analyze the evolution function. */
506 access_fn = analyze_scalar_evolution (loop, def);
507 if (access_fn)
509 STRIP_NOPS (access_fn);
510 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "Access function of PHI: ");
514 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
515 dump_printf (MSG_NOTE, "\n");
517 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
518 = initial_condition_in_loop_num (access_fn, loop->num);
519 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
520 = evolution_part_in_loop_num (access_fn, loop->num);
523 if (!access_fn
524 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
525 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
526 && TREE_CODE (step) != INTEGER_CST))
528 worklist.safe_push (phi);
529 continue;
532 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
533 != NULL_TREE);
534 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
538 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
542 /* Second - identify all reductions and nested cycles. */
543 while (worklist.length () > 0)
545 gimple *phi = worklist.pop ();
546 tree def = PHI_RESULT (phi);
547 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
548 gimple *reduc_stmt;
550 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
560 &double_reduc, false);
561 if (reduc_stmt)
563 if (double_reduc)
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "Detected double reduction.\n");
569 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
570 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
571 vect_double_reduction_def;
573 else
575 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 if (dump_enabled_p ())
578 dump_printf_loc (MSG_NOTE, vect_location,
579 "Detected vectorizable nested cycle.\n");
581 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
582 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
583 vect_nested_cycle;
585 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
593 vect_reduction_def;
594 /* Store the reduction cycles for possible vectorization in
595 loop-aware SLP if it was not detected as reduction
596 chain. */
597 if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
598 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
610 /* Function vect_analyze_scalar_cycles.
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also for its
616 inner-loop, if it exists.
617 Examples for scalar cycles:
619 Example1: reduction:
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
625 Example2: induction:
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
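/* Illustration only, an assumed example combining the two cases above:

     s = 0;
     for (i = 0; i < N; i++)
       s += a[i] * i;

   The loop-header PHI for i is classified as an induction
   (vect_induction_def) and the PHI for s as a reduction
   (vect_reduction_def).  A relevant PHI whose cycle fits none of the
   recognized forms keeps vect_unknown_def_type and later causes the
   loop to be rejected.  */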
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such an inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 /* Transfer group and reduction information from STMT to its pattern stmt. */
653 static void
654 vect_fixup_reduc_chain (gimple *stmt)
656 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
657 gimple *stmtp;
658 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
659 && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
660 REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
661 = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
664 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
665 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
666 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
667 if (stmt)
668 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
669 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
671 while (stmt);
672 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 gimple *first;
681 unsigned i;
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
686 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
687 while (next)
689 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
693 /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
704 /* Function vect_get_loop_niters.
706 Determine how many iterations the loop executes and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
711 Return the loop exit condition. */
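/* Worked example (illustration only): for

     for (i = 0; i < n; i++)
       body;

   with n > 0, the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 and NUMBER_OF_ITERATIONS (the number of header executions)
   is n.  The "+ 1" done below is where the ??? note applies: an
   unsigned do { n++; } while (n != 0); starting from 0 has UINT_MAX
   latch executions, and adding 1 wraps the computed header count
   to 0.  */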
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
728 if (!exit)
729 return cond;
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
745 if (may_be_zero)
747 if (COMPARISON_CLASS_P (may_be_zero))
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
762 may_be_zero = NULL_TREE;
764 else if (integer_nonzerop (may_be_zero))
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
770 else
771 return cond;
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
786 return cond;
789 /* Function bb_in_loop_p
791 Used as predicate for dfs order traversal of the loop bbs. */
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
858 free (body);
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
870 /* Free all levels of MASKS. */
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
885 _loop_vec_info::~_loop_vec_info ()
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
891 /* ??? We're releasing loop_vinfos en-block. */
892 set_stmt_vec_info_vec (&stmt_vec_infos);
893 nbbs = loop->num_nodes;
894 for (j = 0; j < nbbs; j++)
896 basic_block bb = bbs[j];
897 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 free_stmt_vec_info (gsi_stmt (si));
900 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 gimple *stmt = gsi_stmt (si);
904 /* We may have broken canonical form by moving a constant
905 into RHS1 of a commutative op. Fix such occurrences. */
906 if (operands_swapped && is_gimple_assign (stmt))
908 enum tree_code code = gimple_assign_rhs_code (stmt);
910 if ((code == PLUS_EXPR
911 || code == POINTER_PLUS_EXPR
912 || code == MULT_EXPR)
913 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
914 swap_ssa_operands (stmt,
915 gimple_assign_rhs1_ptr (stmt),
916 gimple_assign_rhs2_ptr (stmt));
917 else if (code == COND_EXPR
918 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 tree cond_expr = gimple_assign_rhs1 (stmt);
921 enum tree_code cond_code = TREE_CODE (cond_expr);
923 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
926 0));
927 cond_code = invert_tree_comparison (cond_code,
928 honor_nans);
929 if (cond_code != ERROR_MARK)
931 TREE_SET_CODE (cond_expr, cond_code);
932 swap_ssa_operands (stmt,
933 gimple_assign_rhs2_ptr (stmt),
934 gimple_assign_rhs3_ptr (stmt));
940 /* Free stmt_vec_info. */
941 free_stmt_vec_info (stmt);
942 gsi_next (&si);
946 free (bbs);
948 release_vec_loop_masks (&masks);
949 delete ivexpr_map;
951 loop->aux = NULL;
954 /* Return an invariant or register for EXPR and emit necessary
955 computations in the LOOP_VINFO loop preheader. */
957 tree
958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 if (is_gimple_reg (expr)
961 || is_gimple_min_invariant (expr))
962 return expr;
964 if (! loop_vinfo->ivexpr_map)
965 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
966 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
967 if (! cached)
969 gimple_seq stmts = NULL;
970 cached = force_gimple_operand (unshare_expr (expr),
971 &stmts, true, NULL_TREE);
972 if (stmts)
974 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
975 gsi_insert_seq_on_edge_immediate (e, stmts);
978 return cached;
981 /* Return true if we can use CMP_TYPE as the comparison type to produce
982 all masks required to mask LOOP_VINFO. */
984 static bool
985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 rgroup_masks *rgm;
988 unsigned int i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
990 if (rgm->mask_type != NULL_TREE
991 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
992 cmp_type, rgm->mask_type,
993 OPTIMIZE_FOR_SPEED))
994 return false;
995 return true;
998 /* Calculate the maximum number of scalars per iteration for every
999 rgroup in LOOP_VINFO. */
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 unsigned int res = 1;
1005 unsigned int i;
1006 rgroup_masks *rgm;
1007 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008 res = MAX (res, rgm->max_nscalars_per_iter);
1009 return res;
1012 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1013 whether we can actually generate the masks required. Return true if so,
1014 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
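/* Illustration only, a rough source-level sketch (with an assumed VF of
   4) of what a fully-masked loop computes: each vector iteration is
   guarded by a per-lane mask derived from the scalar IV and the
   iteration count, so no scalar epilogue is needed.  The real loop uses
   IFN_WHILE_ULT on the comparison type chosen below rather than this
   open-coded form; the function name is hypothetical.  */

void
example_masked_add (int *restrict a, const int *restrict b, unsigned int n)
{
  enum { VF = 4 };
  for (unsigned int i = 0; i < n; i += VF)
    for (unsigned int j = 0; j < VF; j++)
      if (i + j < n)  /* lane J of this iteration's mask */
	a[i + j] += b[i + j];
}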
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 unsigned int min_ni_width;
1022 /* Use a normal loop if there are no statements that need masking.
1023 This only happens in rare degenerate cases: it means that the loop
1024 has no loads, no stores, and no live-out values. */
1025 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026 return false;
1028 /* Get the maximum number of iterations that is representable
1029 in the counter type. */
1030 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033 /* Get a more refined estimate for the number of iterations. */
1034 widest_int max_back_edges;
1035 if (max_loop_iterations (loop, &max_back_edges))
1036 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038 /* Account for rgroup masks, in which each bit is replicated N times. */
1039 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041 /* Work out how many bits we need to represent the limit. */
1042 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044 /* Find a scalar mode for which WHILE_ULT is supported. */
1045 opt_scalar_int_mode cmp_mode_iter;
1046 tree cmp_type = NULL_TREE;
1047 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050 if (cmp_bits >= min_ni_width
1051 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054 if (this_type
1055 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 /* Although we could stop as soon as we find a valid mode,
1058 it's often better to continue until we hit Pmode, since the
1059 operands to the WHILE are more likely to be reusable in
1060 address calculations. */
1061 cmp_type = this_type;
1062 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063 break;
1068 if (!cmp_type)
1069 return false;
1071 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072 return true;
1075 /* Calculate the cost of one scalar iteration of the loop. */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081 int nbbs = loop->num_nodes, factor;
1082 int innerloop_iters, i;
1084 /* Gather costs for statements in the scalar loop. */
1086 /* FORNOW. */
1087 innerloop_iters = 1;
1088 if (loop->inner)
1089 innerloop_iters = 50; /* FIXME */
1091 for (i = 0; i < nbbs; i++)
1093 gimple_stmt_iterator si;
1094 basic_block bb = bbs[i];
1096 if (bb->loop_father == loop->inner)
1097 factor = innerloop_iters;
1098 else
1099 factor = 1;
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 gimple *stmt = gsi_stmt (si);
1104 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1106 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107 continue;
1109 /* Skip stmts that are not vectorized inside the loop. */
1110 if (stmt_info
1111 && !STMT_VINFO_RELEVANT_P (stmt_info)
1112 && (!STMT_VINFO_LIVE_P (stmt_info)
1113 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115 continue;
1117 vect_cost_for_stmt kind;
1118 if (STMT_VINFO_DATA_REF (stmt_info))
1120 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121 kind = scalar_load;
1122 else
1123 kind = scalar_store;
1125 else
1126 kind = scalar_stmt;
1128 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 factor, kind, stmt_info, 0, vect_prologue);
1133 /* Now accumulate cost. */
1134 void *target_cost_data = init_cost (loop);
1135 stmt_info_for_cost *si;
1136 int j;
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138 j, si)
1140 struct _stmt_vec_info *stmt_info
1141 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1142 (void) add_stmt_cost (target_cost_data, si->count,
1143 si->kind, stmt_info, si->misalign,
1144 vect_body);
1146 unsigned dummy, body_cost = 0;
1147 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148 destroy_cost_data (target_cost_data);
1149 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1153 /* Function vect_analyze_loop_form_1.
1155 Verify that certain CFG restrictions hold, including:
1156 - the loop has a pre-header
1157 - the loop has a single entry and exit
1158 - the loop exit condition is simple enough
1159 - the number of iterations can be analyzed, i.e., a countable loop. The
1160 niter could be analyzed under some assumptions. */
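/* Illustration only, two assumed source examples of the restrictions
   listed above.  The first loop body is straight-line code, so the loop
   has exactly the header and latch blocks and passes this check; the
   second contains control flow, giving more than two basic blocks, and
   is rejected here with "control flow in loop" (in practice
   if-conversion may have flattened such a body beforehand).

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

     for (i = 0; i < n; i++)
       if (b[i] > 0)
	 a[i] = b[i];
       else
	 a[i] = -b[i];
 */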
1162 bool
1163 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1164 tree *assumptions, tree *number_of_iterationsm1,
1165 tree *number_of_iterations, gcond **inner_loop_cond)
1167 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1169 /* Different restrictions apply when we are considering an inner-most loop,
1170 vs. an outer (nested) loop.
1171 (FORNOW. May want to relax some of these restrictions in the future). */
1173 if (!loop->inner)
1175 /* Inner-most loop. We currently require that the number of BBs is
1176 exactly 2 (the header and latch). Vectorizable inner-most loops
1177 look like this:
1179 (pre-header)
1181 header <--------+
1182 | | |
1183 | +--> latch --+
1185 (exit-bb) */
1187 if (loop->num_nodes != 2)
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: control flow in loop.\n");
1192 return false;
1195 if (empty_block_p (loop->header))
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: empty loop.\n");
1200 return false;
1203 else
1205 struct loop *innerloop = loop->inner;
1206 edge entryedge;
1208 /* Nested loop. We currently require that the loop is doubly-nested,
1209 contains a single inner loop, and the number of BBs is exactly 5.
1210 Vectorizable outer-loops look like this:
1212 (pre-header)
1214 header <---+
1216 inner-loop |
1218 tail ------+
1220 (exit-bb)
1222 The inner-loop has the properties expected of inner-most loops
1223 as described above. */
1225 if ((loop->inner)->inner || (loop->inner)->next)
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: multiple nested loops.\n");
1230 return false;
1233 if (loop->num_nodes != 5)
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: control flow in loop.\n");
1238 return false;
1241 entryedge = loop_preheader_edge (innerloop);
1242 if (entryedge->src != loop->header
1243 || !single_exit (innerloop)
1244 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: unsupported outerloop form.\n");
1249 return false;
1252 /* Analyze the inner-loop. */
1253 tree inner_niterm1, inner_niter, inner_assumptions;
1254 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1255 &inner_assumptions, &inner_niterm1,
1256 &inner_niter, NULL)
1257 /* Don't support analyzing niter under assumptions for inner
1258 loop. */
1259 || !integer_onep (inner_assumptions))
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "not vectorized: Bad inner loop.\n");
1264 return false;
1267 if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: inner-loop count not"
1272 " invariant.\n");
1273 return false;
1276 if (dump_enabled_p ())
1277 dump_printf_loc (MSG_NOTE, vect_location,
1278 "Considering outer-loop vectorization.\n");
1281 if (!single_exit (loop)
1282 || EDGE_COUNT (loop->header->preds) != 2)
1284 if (dump_enabled_p ())
1286 if (!single_exit (loop))
1287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288 "not vectorized: multiple exits.\n");
1289 else if (EDGE_COUNT (loop->header->preds) != 2)
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: too many incoming edges.\n");
1293 return false;
1296 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1297 that the loop is represented as a do-while (with a proper if-guard
1298 before the loop if needed), where the loop header contains all the
1299 executable statements, and the latch is empty. */
1300 if (!empty_block_p (loop->latch)
1301 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "not vectorized: latch block not empty.\n");
1306 return false;
1309 /* Make sure the exit is not abnormal. */
1310 edge e = single_exit (loop);
1311 if (e->flags & EDGE_ABNORMAL)
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "not vectorized: abnormal loop exit edge.\n");
1316 return false;
1319 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1320 number_of_iterationsm1);
1321 if (!*loop_cond)
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: complicated exit condition.\n");
1326 return false;
1329 if (integer_zerop (*assumptions)
1330 || !*number_of_iterations
1331 || chrec_contains_undetermined (*number_of_iterations))
1333 if (dump_enabled_p ())
1334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1335 "not vectorized: number of iterations cannot be "
1336 "computed.\n");
1337 return false;
1340 if (integer_zerop (*number_of_iterations))
1342 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 "not vectorized: number of iterations = 0.\n");
1345 return false;
1348 return true;
1351 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1353 loop_vec_info
1354 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1356 tree assumptions, number_of_iterations, number_of_iterationsm1;
1357 gcond *loop_cond, *inner_loop_cond = NULL;
1359 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1360 &assumptions, &number_of_iterationsm1,
1361 &number_of_iterations, &inner_loop_cond))
1362 return NULL;
1364 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1365 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1366 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1367 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1368 if (!integer_onep (assumptions))
1370 /* We consider to vectorize this loop by versioning it under
1371 some assumptions. In order to do this, we need to clear
1372 existing information computed by scev and niter analyzer. */
1373 scev_reset_htab ();
1374 free_numbers_of_iterations_estimates (loop);
1375 /* Also set flag for this loop so that following scev and niter
1376 analysis are done under the assumptions. */
1377 loop_constraint_set (loop, LOOP_C_FINITE);
1378 /* Also record the assumptions for versioning. */
1379 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1382 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1384 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_NOTE, vect_location,
1387 "Symbolic number of iterations is ");
1388 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1389 dump_printf (MSG_NOTE, "\n");
1393 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1394 if (inner_loop_cond)
1395 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1396 = loop_exit_ctrl_vec_info_type;
1398 gcc_assert (!loop->aux);
1399 loop->aux = loop_vinfo;
1400 return loop_vinfo;
1405 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1406 statements, update the vectorization factor.
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1411 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413 int nbbs = loop->num_nodes;
1414 poly_uint64 vectorization_factor;
1415 int i;
1417 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1419 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420 gcc_assert (known_ne (vectorization_factor, 0U));
1422 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423 vectorization factor of the loop is the unrolling factor required by
1424 the SLP instances. If that unrolling factor is 1, we say that we
1425 perform pure SLP on the loop - cross-iteration parallelism is not
1426 exploited. */
1427 bool only_slp_in_loop = true;
1428 for (i = 0; i < nbbs; i++)
1430 basic_block bb = bbs[i];
1431 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432 gsi_next (&si))
1434 gimple *stmt = gsi_stmt (si);
1435 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1436 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1437 && STMT_VINFO_RELATED_STMT (stmt_info))
1439 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1440 stmt_info = vinfo_for_stmt (stmt);
1442 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1443 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1444 && !PURE_SLP_STMT (stmt_info))
1445 /* STMT needs both SLP and loop-based vectorization. */
1446 only_slp_in_loop = false;
1450 if (only_slp_in_loop)
1452 dump_printf_loc (MSG_NOTE, vect_location,
1453 "Loop contains only SLP stmts\n");
1454 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1456 else
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "Loop contains SLP and non-SLP stmts\n");
1460 /* Both the vectorization factor and unroll factor have the form
1461 current_vector_size * X for some rational X, so they must have
1462 a common multiple. */
1463 vectorization_factor
1464 = force_common_multiple (vectorization_factor,
1465 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1468 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1469 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Updating vectorization factor to ");
1473 dump_dec (MSG_NOTE, vectorization_factor);
1474 dump_printf (MSG_NOTE, ".\n");
1478 /* Return true if STMT_INFO describes a double reduction phi and if
1479 the other phi in the reduction is also relevant for vectorization.
1480 This rejects cases such as:
1482 outer1:
1483 x_1 = PHI <x_3(outer2), ...>;
1486 inner:
1487 x_2 = ...;
1490 outer2:
1491 x_3 = PHI <x_2(inner)>;
1493 if nothing in x_2 or elsewhere makes x_1 relevant. */
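/* Illustration only, an assumed source form that gives rise to a double
   reduction when the outer loop is vectorized: the inner-loop reduction
   of s feeds the outer-loop PHI (x_1/x_3 in the sketch above).

     s = 0;
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
	 s += a[i][j];
 */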
1495 static bool
1496 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1498 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1499 return false;
1501 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1502 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1505 /* Function vect_analyze_loop_operations.
1507 Scan the loop stmts and make sure they are all vectorizable. */
1509 static bool
1510 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1512 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1513 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1514 int nbbs = loop->num_nodes;
1515 int i;
1516 stmt_vec_info stmt_info;
1517 bool need_to_vectorize = false;
1518 bool ok;
1520 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1522 stmt_vector_for_cost cost_vec;
1523 cost_vec.create (2);
1525 for (i = 0; i < nbbs; i++)
1527 basic_block bb = bbs[i];
1529 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1530 gsi_next (&si))
1532 gphi *phi = si.phi ();
1533 ok = true;
1535 stmt_info = vinfo_for_stmt (phi);
1536 if (dump_enabled_p ())
1538 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1539 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1541 if (virtual_operand_p (gimple_phi_result (phi)))
1542 continue;
1544 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1545 (i.e., a phi in the tail of the outer-loop). */
1546 if (! is_loop_header_bb_p (bb))
1548 /* FORNOW: we currently don't support the case that these phis
1549 are not used in the outerloop (unless it is double reduction,
1550 i.e., this phi is vect_reduction_def), because this case
1551 requires actually doing something here. */
1552 if (STMT_VINFO_LIVE_P (stmt_info)
1553 && !vect_active_double_reduction_p (stmt_info))
1555 if (dump_enabled_p ())
1556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1557 "Unsupported loop-closed phi in "
1558 "outer-loop.\n");
1559 return false;
1562 /* If PHI is used in the outer loop, we check that its operand
1563 is defined in the inner loop. */
1564 if (STMT_VINFO_RELEVANT_P (stmt_info))
1566 tree phi_op;
1567 gimple *op_def_stmt;
1569 if (gimple_phi_num_args (phi) != 1)
1570 return false;
1572 phi_op = PHI_ARG_DEF (phi, 0);
1573 if (TREE_CODE (phi_op) != SSA_NAME)
1574 return false;
1576 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1577 if (gimple_nop_p (op_def_stmt)
1578 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1579 || !vinfo_for_stmt (op_def_stmt))
1580 return false;
1582 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1583 != vect_used_in_outer
1584 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1585 != vect_used_in_outer_by_reduction)
1586 return false;
1589 continue;
1592 gcc_assert (stmt_info);
1594 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1595 || STMT_VINFO_LIVE_P (stmt_info))
1596 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1598 /* A scalar-dependence cycle that we don't support. */
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: scalar dependence cycle.\n");
1602 return false;
1605 if (STMT_VINFO_RELEVANT_P (stmt_info))
1607 need_to_vectorize = true;
1608 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1609 && ! PURE_SLP_STMT (stmt_info))
1610 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1611 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1612 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1613 && ! PURE_SLP_STMT (stmt_info))
1614 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1615 &cost_vec);
1618 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1619 if (ok
1620 && STMT_VINFO_LIVE_P (stmt_info)
1621 && !PURE_SLP_STMT (stmt_info))
1622 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1623 &cost_vec);
1625 if (!ok)
1627 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "not vectorized: relevant phi not "
1631 "supported: ");
1632 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1634 return false;
1638 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1639 gsi_next (&si))
1641 gimple *stmt = gsi_stmt (si);
1642 if (!gimple_clobber_p (stmt)
1643 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1644 &cost_vec))
1645 return false;
1647 } /* bbs */
1649 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1650 cost_vec.release ();
1652 /* All operations in the loop are either irrelevant (deal with loop
1653 control, or dead), or only used outside the loop and can be moved
1654 out of the loop (e.g. invariants, inductions). The loop can be
1655 optimized away by scalar optimizations. We're better off not
1656 touching this loop. */
1657 if (!need_to_vectorize)
1659 if (dump_enabled_p ())
1660 dump_printf_loc (MSG_NOTE, vect_location,
1661 "All the computation can be taken out of the loop.\n");
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664 "not vectorized: redundant loop. no profit to "
1665 "vectorize.\n");
1666 return false;
1669 return true;
1672 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1673 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1674 definitely no, or -1 if it's worth retrying. */
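/* Worked example (illustration only, with assumed numbers): if the cost
   model reports min_profitable_iters = 12, the user passed
   --param min-vect-loop-bound=4 and the assumed VF is 4, then
   min_scalar_loop_bound = 4 * 4 = 16 and the threshold becomes
   th = MAX (16, 12) = 16; a known iteration count below 16 therefore
   makes this function return 0.  */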
1676 static int
1677 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1679 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1680 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1682 /* Only fully-masked loops can have iteration counts less than the
1683 vectorization factor. */
1684 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1686 HOST_WIDE_INT max_niter;
1688 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1689 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1690 else
1691 max_niter = max_stmt_executions_int (loop);
1693 if (max_niter != -1
1694 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: iteration count smaller than "
1699 "vectorization factor.\n");
1700 return 0;
1704 int min_profitable_iters, min_profitable_estimate;
1705 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1706 &min_profitable_estimate);
1708 if (min_profitable_iters < 0)
1710 if (dump_enabled_p ())
1711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1712 "not vectorized: vectorization not profitable.\n");
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715 "not vectorized: vector version will never be "
1716 "profitable.\n");
1717 return -1;
1720 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1721 * assumed_vf);
1723 /* Use the cost model only if it is more conservative than the user-specified
1724 threshold. */
1725 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1726 min_profitable_iters);
1728 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1730 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1731 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "not vectorized: vectorization not profitable.\n");
1736 if (dump_enabled_p ())
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "not vectorized: iteration count smaller than user "
1739 "specified loop bound parameter or minimum profitable "
1740 "iterations (whichever is more conservative).\n");
1741 return 0;
1744 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1745 if (estimated_niter == -1)
1746 estimated_niter = likely_max_stmt_executions_int (loop);
1747 if (estimated_niter != -1
1748 && ((unsigned HOST_WIDE_INT) estimated_niter
1749 < MAX (th, (unsigned) min_profitable_estimate)))
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "not vectorized: estimated iteration count too "
1754 "small.\n");
1755 if (dump_enabled_p ())
1756 dump_printf_loc (MSG_NOTE, vect_location,
1757 "not vectorized: estimated iteration count smaller "
1758 "than specified loop bound parameter or minimum "
1759 "profitable iterations (whichever is more "
1760 "conservative).\n");
1761 return -1;
1764 return 1;
1767 static bool
1768 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1769 vec<data_reference_p> *datarefs,
1770 unsigned int *n_stmts)
1772 *n_stmts = 0;
1773 for (unsigned i = 0; i < loop->num_nodes; i++)
1774 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1775 !gsi_end_p (gsi); gsi_next (&gsi))
1777 gimple *stmt = gsi_stmt (gsi);
1778 if (is_gimple_debug (stmt))
1779 continue;
1780 ++(*n_stmts);
1781 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1783 if (is_gimple_call (stmt) && loop->safelen)
1785 tree fndecl = gimple_call_fndecl (stmt), op;
1786 if (fndecl != NULL_TREE)
1788 cgraph_node *node = cgraph_node::get (fndecl);
1789 if (node != NULL && node->simd_clones != NULL)
1791 unsigned int j, n = gimple_call_num_args (stmt);
1792 for (j = 0; j < n; j++)
1794 op = gimple_call_arg (stmt, j);
1795 if (DECL_P (op)
1796 || (REFERENCE_CLASS_P (op)
1797 && get_base_address (op)))
1798 break;
1800 op = gimple_call_lhs (stmt);
1801 /* Ignore #pragma omp declare simd functions
1802 if they don't have data references in the
1803 call stmt itself. */
1804 if (j == n
1805 && !(op
1806 && (DECL_P (op)
1807 || (REFERENCE_CLASS_P (op)
1808 && get_base_address (op)))))
1809 continue;
1813 return false;
1816 return true;
1819 /* Function vect_analyze_loop_2.
1821 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1822 for it. The different analyses will record information in the
1823 loop_vec_info struct. */
1824 static bool
1825 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1827 bool ok;
1828 int res;
1829 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1830 poly_uint64 min_vf = 2;
1832 /* The first group of checks is independent of the vector size. */
1833 fatal = true;
1835 /* Find all data references in the loop (which correspond to vdefs/vuses)
1836 and analyze their evolution in the loop. */
1838 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1840 /* Gather the data references and count stmts in the loop. */
1841 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1843 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1844 &LOOP_VINFO_DATAREFS (loop_vinfo),
1845 n_stmts))
1847 if (dump_enabled_p ())
1848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849 "not vectorized: loop contains function "
1850 "calls or data references that cannot "
1851 "be analyzed\n");
1852 return false;
1854 loop_vinfo->shared->save_datarefs ();
1856 else
1857 loop_vinfo->shared->check_datarefs ();
1859 /* Analyze the data references and also adjust the minimal
1860 vectorization factor according to the loads and stores. */
1862 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1863 if (!ok)
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "bad data references.\n");
1868 return false;
1871 /* Classify all cross-iteration scalar data-flow cycles.
1872 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1873 vect_analyze_scalar_cycles (loop_vinfo);
1875 vect_pattern_recog (loop_vinfo);
1877 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1879 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1880 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1882 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1883 if (!ok)
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1887 "bad data access.\n");
1888 return false;
1891 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1893 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1894 if (!ok)
1896 if (dump_enabled_p ())
1897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1898 "unexpected pattern.\n");
1899 return false;
1902 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
1903 fatal = false;
1905 /* Analyze data dependences between the data-refs in the loop
1906 and adjust the maximum vectorization factor according to
1907 the dependences.
1908 FORNOW: fail at the first data dependence that we encounter. */
1910 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1911 if (!ok
1912 || (max_vf != MAX_VECTORIZATION_FACTOR
1913 && maybe_lt (max_vf, min_vf)))
1915 if (dump_enabled_p ())
1916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1917 "bad data dependence.\n");
1918 return false;
1920 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1922 ok = vect_determine_vectorization_factor (loop_vinfo);
1923 if (!ok)
1925 if (dump_enabled_p ())
1926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1927 "can't determine vectorization factor.\n");
1928 return false;
1930 if (max_vf != MAX_VECTORIZATION_FACTOR
1931 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1935 "bad data dependence.\n");
1936 return false;
1939 /* Compute the scalar iteration cost. */
1940 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1942 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1943 unsigned th;
1945 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1946 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1947 if (!ok)
1948 return false;
1950 /* If there are any SLP instances mark them as pure_slp. */
1951 bool slp = vect_make_slp_decision (loop_vinfo);
1952 if (slp)
1954 /* Find stmts that need to be both vectorized and SLPed. */
1955 vect_detect_hybrid_slp (loop_vinfo);
1957 /* Update the vectorization factor based on the SLP decision. */
1958 vect_update_vf_for_slp (loop_vinfo);
1961 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1963 /* We don't expect to have to roll back to anything other than an empty
1964 set of rgroups. */
1965 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1967 /* This is the point where we can re-start analysis with SLP forced off. */
1968 start_over:
1970 /* Now the vectorization factor is final. */
1971 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1972 gcc_assert (known_ne (vectorization_factor, 0U));
1974 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1976 dump_printf_loc (MSG_NOTE, vect_location,
1977 "vectorization_factor = ");
1978 dump_dec (MSG_NOTE, vectorization_factor);
1979 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1980 LOOP_VINFO_INT_NITERS (loop_vinfo));
1983 HOST_WIDE_INT max_niter
1984 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1986 /* Analyze the alignment of the data-refs in the loop.
1987 Fail if a data reference is found that cannot be vectorized. */
1989 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1990 if (!ok)
1992 if (dump_enabled_p ())
1993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1994 "bad data alignment.\n");
1995 return false;
1998 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1999 It is important to call pruning after vect_analyze_data_ref_accesses,
2000 since we use grouping information gathered by interleaving analysis. */
2001 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2002 if (!ok)
2003 return false;
2005 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2006 vectorization. */
2007 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2009 /* This pass will decide on using loop versioning and/or loop peeling in
2010 order to enhance the alignment of data references in the loop. */
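/* For example (illustrative only): a data reference with unknown
alignment may be handled by peeling a few scalar iterations until it
becomes aligned, or by versioning the loop under a runtime alignment
check; the routine below chooses between these (or does nothing)
based on the cost model. */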
2011 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2012 if (!ok)
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2016 "bad data alignment.\n");
2017 return false;
2021 if (slp)
2023 /* Analyze operations in the SLP instances. Note this may
2024 remove unsupported SLP instances which makes the above
2025 SLP kind detection invalid. */
2026 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2027 vect_slp_analyze_operations (loop_vinfo);
2028 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2029 goto again;
2032 /* Scan all the remaining operations in the loop that are not subject
2033 to SLP and make sure they are vectorizable. */
2034 ok = vect_analyze_loop_operations (loop_vinfo);
2035 if (!ok)
2037 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2039 "bad operation or unsupported loop bound.\n");
2040 return false;
2043 /* Decide whether to use a fully-masked loop for this vectorization
2044 factor. */
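/* A fully-masked loop (for instance on targets with SVE-style
predication) handles the final partial vector iteration under a loop
mask instead of peeling a scalar epilogue, so the vector loop covers
every original iteration by itself. */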
2045 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2046 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2047 && vect_verify_full_masking (loop_vinfo));
2048 if (dump_enabled_p ())
2050 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "using a fully-masked loop.\n");
2053 else
2054 dump_printf_loc (MSG_NOTE, vect_location,
2055 "not using a fully-masked loop.\n");
2058 /* If an epilogue loop is required because of data accesses with gaps,
2059 one additional iteration needs to be peeled. Check if there are
2060 enough iterations for vectorization. */
2061 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2062 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2063 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2065 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2066 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2068 if (known_lt (wi::to_widest (scalar_niters), vf))
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "loop has no enough iterations to support"
2073 " peeling for gaps.\n");
2074 return false;
2078 /* Check the costings of the loop make vectorizing worthwhile. */
2079 res = vect_analyze_loop_costing (loop_vinfo);
2080 if (res < 0)
2081 goto again;
2082 if (!res)
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2086 "Loop costings not worthwhile.\n");
2087 return false;
2090 /* Decide whether we need to create an epilogue loop to handle
2091 remaining scalar iterations. */
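/* For example (illustrative numbers only): with 100 iterations and a
vectorization factor of 8, the vector loop covers 96 iterations and
the remaining 4 have to run in a scalar epilogue. */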
2092 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2094 unsigned HOST_WIDE_INT const_vf;
2095 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2096 /* The main loop handles all iterations. */
2097 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2098 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2099 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2101 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2102 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2103 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2104 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2106 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2107 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2108 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2109 < (unsigned) exact_log2 (const_vf))
2110 /* In case of versioning, check if the maximum number of
2111 iterations is greater than th. If they are identical,
2112 the epilogue is unnecessary. */
2113 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2114 || ((unsigned HOST_WIDE_INT) max_niter
2115 > (th / const_vf) * const_vf))))
2116 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2118 /* If an epilogue loop is required make sure we can create one. */
2119 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2120 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2124 if (!vect_can_advance_ivs_p (loop_vinfo)
2125 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2126 single_exit (LOOP_VINFO_LOOP
2127 (loop_vinfo))))
2129 if (dump_enabled_p ())
2130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2131 "not vectorized: can't create required "
2132 "epilog loop\n");
2133 goto again;
2137 /* During peeling, we need to check whether the number of loop iterations
2138 is enough for both the peeled prologue loop and the vector loop. This
2139 check can be merged with the threshold check of loop versioning, so
2140 increase the threshold for this case if necessary. */
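/* E.g. (illustrative numbers): peeling 3 prologue iterations for
alignment with a vectorization factor of 8 requires at least
3 + 8 = 11 iterations, so the versioning threshold computed below is
raised to that value. */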
2141 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2143 poly_uint64 niters_th = 0;
2145 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2147 /* Niters for peeled prolog loop. */
2148 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2150 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2151 tree vectype
2152 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2153 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2155 else
2156 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2159 /* Niters for at least one iteration of vectorized loop. */
2160 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2161 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2162 /* One additional iteration because of peeling for gap. */
2163 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2164 niters_th += 1;
2165 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2168 gcc_assert (known_eq (vectorization_factor,
2169 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2171 /* Ok to vectorize! */
2172 return true;
2174 again:
2175 /* Try again with SLP forced off; if we didn't do any SLP there is
2176 no point in re-trying. */
2177 if (!slp)
2178 return false;
2180 /* If there are reduction chains re-trying will fail anyway. */
2181 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2182 return false;
2184 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2185 via interleaving or lane instructions. */
2186 slp_instance instance;
2187 slp_tree node;
2188 unsigned i, j;
2189 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2191 stmt_vec_info vinfo;
2192 vinfo = vinfo_for_stmt
2193 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2194 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2195 continue;
2196 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2197 unsigned int size = DR_GROUP_SIZE (vinfo);
2198 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2199 if (! vect_store_lanes_supported (vectype, size, false)
2200 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2201 && ! vect_grouped_store_supported (vectype, size))
2202 return false;
2203 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2205 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2206 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2207 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2208 size = DR_GROUP_SIZE (vinfo);
2209 vectype = STMT_VINFO_VECTYPE (vinfo);
2210 if (! vect_load_lanes_supported (vectype, size, false)
2211 && ! vect_grouped_load_supported (vectype, single_element_p,
2212 size))
2213 return false;
2217 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_NOTE, vect_location,
2219 "re-trying with SLP disabled\n");
2221 /* Roll back state appropriately. No SLP this time. */
2222 slp = false;
2223 /* Restore the vectorization factor as it was without SLP. */
2224 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2225 /* Free the SLP instances. */
2226 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2227 vect_free_slp_instance (instance);
2228 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2229 /* Reset SLP type to loop_vect on all stmts. */
2230 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2232 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2233 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2234 !gsi_end_p (si); gsi_next (&si))
2236 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2237 STMT_SLP_TYPE (stmt_info) = loop_vect;
2239 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2240 !gsi_end_p (si); gsi_next (&si))
2242 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2243 STMT_SLP_TYPE (stmt_info) = loop_vect;
2244 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2246 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2247 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2248 STMT_SLP_TYPE (stmt_info) = loop_vect;
2249 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2250 !gsi_end_p (pi); gsi_next (&pi))
2252 gimple *pstmt = gsi_stmt (pi);
2253 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2258 /* Free optimized alias test DDRS. */
2259 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2260 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2261 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2262 /* Reset target cost data. */
2263 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2264 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2265 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2266 /* Reset accumulated rgroup information. */
2267 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2268 /* Reset assorted flags. */
2269 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2270 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2271 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2272 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2273 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2275 goto start_over;
2278 /* Function vect_analyze_loop.
2280 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2281 for it. The different analyses will record information in the
2282 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2283 be vectorized. */
2284 loop_vec_info
2285 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2286 vec_info_shared *shared)
2288 loop_vec_info loop_vinfo;
2289 auto_vector_sizes vector_sizes;
2291 /* Autodetect first vector size we try. */
2292 current_vector_size = 0;
2293 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2294 unsigned int next_size = 0;
2296 DUMP_VECT_SCOPE ("analyze_loop_nest");
2298 if (loop_outer (loop)
2299 && loop_vec_info_for_loop (loop_outer (loop))
2300 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "outer-loop already vectorized.\n");
2305 return NULL;
2308 if (!find_loop_nest (loop, &shared->loop_nest))
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2312 "not vectorized: loop nest containing two "
2313 "or more consecutive inner loops cannot be "
2314 "vectorized\n");
2315 return NULL;
2318 unsigned n_stmts;
2319 poly_uint64 autodetected_vector_size = 0;
2320 while (1)
2322 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2323 loop_vinfo = vect_analyze_loop_form (loop, shared);
2324 if (!loop_vinfo)
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328 "bad loop form.\n");
2329 return NULL;
2332 bool fatal = false;
2334 if (orig_loop_vinfo)
2335 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2337 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2339 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2341 return loop_vinfo;
2344 delete loop_vinfo;
2346 if (next_size == 0)
2347 autodetected_vector_size = current_vector_size;
2349 if (next_size < vector_sizes.length ()
2350 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2351 next_size += 1;
2353 if (fatal
2354 || next_size == vector_sizes.length ()
2355 || known_eq (current_vector_size, 0U))
2356 return NULL;
2358 /* Try the next biggest vector size. */
2359 current_vector_size = vector_sizes[next_size++];
2360 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_NOTE, vect_location,
2363 "***** Re-trying analysis with "
2364 "vector size ");
2365 dump_dec (MSG_NOTE, current_vector_size);
2366 dump_printf (MSG_NOTE, "\n");
2371 /* Return true if there is an in-order reduction function for CODE, storing
2372 it in *REDUC_FN if so. */
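/* An in-order (fold-left) reduction such as IFN_FOLD_LEFT_PLUS accumulates
the elements strictly in their original order, e.g.
(((init + a0) + a1) + a2) + a3, instead of building reassociated
partial sums, which matters for non-associative floating-point math. */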
2374 static bool
2375 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2377 switch (code)
2379 case PLUS_EXPR:
2380 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2381 return true;
2383 default:
2384 return false;
2388 /* Function reduction_fn_for_scalar_code
2390 Input:
2391 CODE - tree_code of a reduction operation.
2393 Output:
2394 REDUC_FN - the corresponding internal function to be used to reduce the
2395 vector of partial results into a single scalar result, or IFN_LAST
2396 if the operation is a supported reduction operation, but does not have
2397 such an internal function.
2399 Return FALSE if CODE currently cannot be vectorized as reduction. */
2401 static bool
2402 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2404 switch (code)
2406 case MAX_EXPR:
2407 *reduc_fn = IFN_REDUC_MAX;
2408 return true;
2410 case MIN_EXPR:
2411 *reduc_fn = IFN_REDUC_MIN;
2412 return true;
2414 case PLUS_EXPR:
2415 *reduc_fn = IFN_REDUC_PLUS;
2416 return true;
2418 case BIT_AND_EXPR:
2419 *reduc_fn = IFN_REDUC_AND;
2420 return true;
2422 case BIT_IOR_EXPR:
2423 *reduc_fn = IFN_REDUC_IOR;
2424 return true;
2426 case BIT_XOR_EXPR:
2427 *reduc_fn = IFN_REDUC_XOR;
2428 return true;
2430 case MULT_EXPR:
2431 case MINUS_EXPR:
2432 *reduc_fn = IFN_LAST;
2433 return true;
2435 default:
2436 return false;
2440 /* If there is a neutral value X such that SLP reduction NODE would not
2441 be affected by the introduction of additional X elements, return that X,
2442 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2443 is true if the SLP statements perform a single reduction, false if each
2444 statement performs an independent reduction. */
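/* For instance, an SLP sum reduction over three lanes can be widened to a
four-lane vector by padding with the neutral value 0 without changing
the result; a product reduction would be padded with 1 instead. */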
2446 static tree
2447 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2448 bool reduc_chain)
2450 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2451 gimple *stmt = stmts[0];
2452 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2453 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2454 tree scalar_type = TREE_TYPE (vector_type);
2455 struct loop *loop = gimple_bb (stmt)->loop_father;
2456 gcc_assert (loop);
2458 switch (code)
2460 case WIDEN_SUM_EXPR:
2461 case DOT_PROD_EXPR:
2462 case SAD_EXPR:
2463 case PLUS_EXPR:
2464 case MINUS_EXPR:
2465 case BIT_IOR_EXPR:
2466 case BIT_XOR_EXPR:
2467 return build_zero_cst (scalar_type);
2469 case MULT_EXPR:
2470 return build_one_cst (scalar_type);
2472 case BIT_AND_EXPR:
2473 return build_all_ones_cst (scalar_type);
2475 case MAX_EXPR:
2476 case MIN_EXPR:
2477 /* For MIN/MAX the initial values are neutral. A reduction chain
2478 has only a single initial value, so that value is neutral for
2479 all statements. */
2480 if (reduc_chain)
2481 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2482 return NULL_TREE;
2484 default:
2485 return NULL_TREE;
2489 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2490 STMT is printed with a message MSG. */
2492 static void
2493 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2495 dump_printf_loc (msg_type, vect_location, "%s", msg);
2496 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2500 /* Detect SLP reduction of the form:
2502 #a1 = phi <a5, a0>
2503 a2 = operation (a1)
2504 a3 = operation (a2)
2505 a4 = operation (a3)
2506 a5 = operation (a4)
2508 #a = phi <a5>
2510 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2511 FIRST_STMT is the first reduction stmt in the chain
2512 (a2 = operation (a1)).
2514 Return TRUE if a reduction chain was detected. */
2516 static bool
2517 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2518 gimple *first_stmt)
2520 struct loop *loop = (gimple_bb (phi))->loop_father;
2521 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2522 enum tree_code code;
2523 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2524 stmt_vec_info use_stmt_info, current_stmt_info;
2525 tree lhs;
2526 imm_use_iterator imm_iter;
2527 use_operand_p use_p;
2528 int nloop_uses, size = 0, n_out_of_loop_uses;
2529 bool found = false;
2531 if (loop != vect_loop)
2532 return false;
2534 lhs = PHI_RESULT (phi);
2535 code = gimple_assign_rhs_code (first_stmt);
2536 while (1)
2538 nloop_uses = 0;
2539 n_out_of_loop_uses = 0;
2540 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2542 gimple *use_stmt = USE_STMT (use_p);
2543 if (is_gimple_debug (use_stmt))
2544 continue;
2546 /* Check if we got back to the reduction phi. */
2547 if (use_stmt == phi)
2549 loop_use_stmt = use_stmt;
2550 found = true;
2551 break;
2554 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2556 loop_use_stmt = use_stmt;
2557 nloop_uses++;
2559 else
2560 n_out_of_loop_uses++;
2562 /* There can be either a single use in the loop or two uses in
2563 phi nodes. */
2564 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2565 return false;
2568 if (found)
2569 break;
2571 /* We reached a statement with no loop uses. */
2572 if (nloop_uses == 0)
2573 return false;
2575 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2576 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2577 return false;
2579 if (!is_gimple_assign (loop_use_stmt)
2580 || code != gimple_assign_rhs_code (loop_use_stmt)
2581 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2582 return false;
2584 /* Insert USE_STMT into reduction chain. */
2585 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2586 if (current_stmt)
2588 current_stmt_info = vinfo_for_stmt (current_stmt);
2589 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2590 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2591 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2593 else
2594 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2596 lhs = gimple_assign_lhs (loop_use_stmt);
2597 current_stmt = loop_use_stmt;
2598 size++;
2601 if (!found || loop_use_stmt != phi || size < 2)
2602 return false;
2604 /* Swap the operands, if needed, to make the reduction operand be the second
2605 operand. */
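/* E.g. a chain statement a3 = a2 + b[i], where a2 is the value carried
from the previous statement in the chain, is rewritten as
a3 = b[i] + a2 so that the carried (reduction) operand is always the
second rhs operand. */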
2606 lhs = PHI_RESULT (phi);
2607 next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2608 while (next_stmt)
2610 if (gimple_assign_rhs2 (next_stmt) == lhs)
2612 tree op = gimple_assign_rhs1 (next_stmt);
2613 gimple *def_stmt = NULL;
2615 if (TREE_CODE (op) == SSA_NAME)
2616 def_stmt = SSA_NAME_DEF_STMT (op);
2618 /* Check that the other def is either defined in the loop
2619 ("vect_internal_def"), or it's an induction (defined by a
2620 loop-header phi-node). */
2621 if (def_stmt
2622 && gimple_bb (def_stmt)
2623 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2624 && (is_gimple_assign (def_stmt)
2625 || is_gimple_call (def_stmt)
2626 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2627 == vect_induction_def
2628 || (gimple_code (def_stmt) == GIMPLE_PHI
2629 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2630 == vect_internal_def
2631 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2633 lhs = gimple_assign_lhs (next_stmt);
2634 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2635 continue;
2638 return false;
2640 else
2642 tree op = gimple_assign_rhs2 (next_stmt);
2643 gimple *def_stmt = NULL;
2645 if (TREE_CODE (op) == SSA_NAME)
2646 def_stmt = SSA_NAME_DEF_STMT (op);
2648 /* Check that the other def is either defined in the loop
2649 ("vect_internal_def"), or it's an induction (defined by a
2650 loop-header phi-node). */
2651 if (def_stmt
2652 && gimple_bb (def_stmt)
2653 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2654 && (is_gimple_assign (def_stmt)
2655 || is_gimple_call (def_stmt)
2656 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2657 == vect_induction_def
2658 || (gimple_code (def_stmt) == GIMPLE_PHI
2659 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2660 == vect_internal_def
2661 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2663 if (dump_enabled_p ())
2665 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2666 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2669 swap_ssa_operands (next_stmt,
2670 gimple_assign_rhs1_ptr (next_stmt),
2671 gimple_assign_rhs2_ptr (next_stmt));
2672 update_stmt (next_stmt);
2674 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2675 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2677 else
2678 return false;
2681 lhs = gimple_assign_lhs (next_stmt);
2682 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2685 /* Save the chain for further analysis in SLP detection. */
2686 first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2687 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2688 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2690 return true;
2693 /* Return true if we need an in-order reduction for operation CODE
2694 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2695 overflow must wrap. */
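/* For example, in double precision (0.1 + 0.2) + 0.3 and 0.1 + (0.2 + 0.3)
give different results, so without -fassociative-math a floating-point
sum has to be evaluated strictly in order (as a fold-left reduction). */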
2697 static bool
2698 needs_fold_left_reduction_p (tree type, tree_code code,
2699 bool need_wrapping_integral_overflow)
2701 /* CHECKME: check for !flag_finite_math_only too? */
2702 if (SCALAR_FLOAT_TYPE_P (type))
2703 switch (code)
2705 case MIN_EXPR:
2706 case MAX_EXPR:
2707 return false;
2709 default:
2710 return !flag_associative_math;
2713 if (INTEGRAL_TYPE_P (type))
2715 if (!operation_no_trapping_overflow (type, code))
2716 return true;
2717 if (need_wrapping_integral_overflow
2718 && !TYPE_OVERFLOW_WRAPS (type)
2719 && operation_can_overflow (code))
2720 return true;
2721 return false;
2724 if (SAT_FIXED_POINT_TYPE_P (type))
2725 return true;
2727 return false;
2730 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2731 reduction operation CODE has a handled computation expression. */
2733 bool
2734 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2735 enum tree_code code)
2737 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2738 auto_bitmap visited;
2739 tree lookfor = PHI_RESULT (phi);
2740 ssa_op_iter curri;
2741 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2742 while (USE_FROM_PTR (curr) != loop_arg)
2743 curr = op_iter_next_use (&curri);
2744 curri.i = curri.numops;
2747 path.safe_push (std::make_pair (curri, curr));
2748 tree use = USE_FROM_PTR (curr);
2749 if (use == lookfor)
2750 break;
2751 gimple *def = SSA_NAME_DEF_STMT (use);
2752 if (gimple_nop_p (def)
2753 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2755 pop:
2758 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2759 curri = x.first;
2760 curr = x.second;
2762 curr = op_iter_next_use (&curri);
2763 /* Skip already visited or non-SSA operands (from iterating
2764 over PHI args). */
2765 while (curr != NULL_USE_OPERAND_P
2766 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2767 || ! bitmap_set_bit (visited,
2768 SSA_NAME_VERSION
2769 (USE_FROM_PTR (curr)))));
2771 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2772 if (curr == NULL_USE_OPERAND_P)
2773 break;
2775 else
2777 if (gimple_code (def) == GIMPLE_PHI)
2778 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2779 else
2780 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2781 while (curr != NULL_USE_OPERAND_P
2782 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2783 || ! bitmap_set_bit (visited,
2784 SSA_NAME_VERSION
2785 (USE_FROM_PTR (curr)))))
2786 curr = op_iter_next_use (&curri);
2787 if (curr == NULL_USE_OPERAND_P)
2788 goto pop;
2791 while (1);
2792 if (dump_file && (dump_flags & TDF_DETAILS))
2794 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2795 unsigned i;
2796 std::pair<ssa_op_iter, use_operand_p> *x;
2797 FOR_EACH_VEC_ELT (path, i, x)
2799 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2800 dump_printf (MSG_NOTE, " ");
2802 dump_printf (MSG_NOTE, "\n");
2805 /* Check whether the reduction path detected is valid. */
2806 bool fail = path.length () == 0;
2807 bool neg = false;
2808 for (unsigned i = 1; i < path.length (); ++i)
2810 gimple *use_stmt = USE_STMT (path[i].second);
2811 tree op = USE_FROM_PTR (path[i].second);
2812 if (! has_single_use (op)
2813 || ! is_gimple_assign (use_stmt))
2815 fail = true;
2816 break;
2818 if (gimple_assign_rhs_code (use_stmt) != code)
2820 if (code == PLUS_EXPR
2821 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2823 /* Track whether we negate the reduction value each iteration. */
2824 if (gimple_assign_rhs2 (use_stmt) == op)
2825 neg = ! neg;
2827 else
2829 fail = true;
2830 break;
2834 return ! fail && ! neg;
2838 /* Function vect_is_simple_reduction
2840 (1) Detect a cross-iteration def-use cycle that represents a simple
2841 reduction computation. We look for the following pattern:
2843 loop_header:
2844 a1 = phi < a0, a2 >
2845 a3 = ...
2846 a2 = operation (a3, a1)
2850 a3 = ...
2851 loop_header:
2852 a1 = phi < a0, a2 >
2853 a2 = operation (a3, a1)
2855 such that:
2856 1. operation is commutative and associative and it is safe to
2857 change the order of the computation
2858 2. no uses for a2 in the loop (a2 is used out of the loop)
2859 3. no uses of a1 in the loop besides the reduction operation
2860 4. no uses of a1 outside the loop.
2862 Conditions 1,4 are tested here.
2863 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2865 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2866 nested cycles.
2868 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2869 reductions:
2871 a1 = phi < a0, a2 >
2872 inner loop (def of a3)
2873 a2 = phi < a3 >
2875 (4) Detect condition expressions, i.e.:
2876 for (int i = 0; i < N; i++)
2877 if (a[i] < val)
2878 ret_val = a[i];
2882 static gimple *
2883 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2884 bool *double_reduc,
2885 bool need_wrapping_integral_overflow,
2886 enum vect_reduction_type *v_reduc_type)
2888 struct loop *loop = (gimple_bb (phi))->loop_father;
2889 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2890 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2891 enum tree_code orig_code, code;
2892 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2893 tree type;
2894 int nloop_uses;
2895 tree name;
2896 imm_use_iterator imm_iter;
2897 use_operand_p use_p;
2898 bool phi_def;
2900 *double_reduc = false;
2901 *v_reduc_type = TREE_CODE_REDUCTION;
2903 tree phi_name = PHI_RESULT (phi);
2904 /* ??? If there are no uses of the PHI result the inner loop reduction
2905 won't be detected as possibly double-reduction by vectorizable_reduction
2906 because that tries to walk the PHI arg from the preheader edge which
2907 can be constant. See PR60382. */
2908 if (has_zero_uses (phi_name))
2909 return NULL;
2910 nloop_uses = 0;
2911 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2913 gimple *use_stmt = USE_STMT (use_p);
2914 if (is_gimple_debug (use_stmt))
2915 continue;
2917 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2919 if (dump_enabled_p ())
2920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2921 "intermediate value used outside loop.\n");
2923 return NULL;
2926 nloop_uses++;
2927 if (nloop_uses > 1)
2929 if (dump_enabled_p ())
2930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2931 "reduction value used in loop.\n");
2932 return NULL;
2935 phi_use_stmt = use_stmt;
2938 edge latch_e = loop_latch_edge (loop);
2939 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2940 if (TREE_CODE (loop_arg) != SSA_NAME)
2942 if (dump_enabled_p ())
2944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2945 "reduction: not ssa_name: ");
2946 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2947 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2949 return NULL;
2952 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2953 if (is_gimple_assign (def_stmt))
2955 name = gimple_assign_lhs (def_stmt);
2956 phi_def = false;
2958 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2960 name = PHI_RESULT (def_stmt);
2961 phi_def = true;
2963 else
2965 if (dump_enabled_p ())
2967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2968 "reduction: unhandled reduction operation: ");
2969 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2971 return NULL;
2974 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2975 return NULL;
2977 nloop_uses = 0;
2978 auto_vec<gphi *, 3> lcphis;
2979 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2981 gimple *use_stmt = USE_STMT (use_p);
2982 if (is_gimple_debug (use_stmt))
2983 continue;
2984 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2985 nloop_uses++;
2986 else
2987 /* We can have more than one loop-closed PHI. */
2988 lcphis.safe_push (as_a <gphi *> (use_stmt));
2989 if (nloop_uses > 1)
2991 if (dump_enabled_p ())
2992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2993 "reduction used in loop.\n");
2994 return NULL;
2998 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2999 defined in the inner loop. */
3000 if (phi_def)
3002 op1 = PHI_ARG_DEF (def_stmt, 0);
3004 if (gimple_phi_num_args (def_stmt) != 1
3005 || TREE_CODE (op1) != SSA_NAME)
3007 if (dump_enabled_p ())
3008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3009 "unsupported phi node definition.\n");
3011 return NULL;
3014 def1 = SSA_NAME_DEF_STMT (op1);
3015 if (gimple_bb (def1)
3016 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3017 && loop->inner
3018 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3019 && is_gimple_assign (def1)
3020 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3022 if (dump_enabled_p ())
3023 report_vect_op (MSG_NOTE, def_stmt,
3024 "detected double reduction: ");
3026 *double_reduc = true;
3027 return def_stmt;
3030 return NULL;
3033 /* If we are vectorizing an inner reduction, we execute it
3034 in the original order only when we are not dealing with a
3035 double reduction. */
3036 bool check_reduction = true;
3037 if (flow_loop_nested_p (vect_loop, loop))
3039 gphi *lcphi;
3040 unsigned i;
3041 check_reduction = false;
3042 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3043 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3045 gimple *use_stmt = USE_STMT (use_p);
3046 if (is_gimple_debug (use_stmt))
3047 continue;
3048 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3049 check_reduction = true;
3053 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3054 code = orig_code = gimple_assign_rhs_code (def_stmt);
3056 /* We can handle "res -= x[i]", which is non-associative, by
3057 simply rewriting this into "res += -x[i]". Avoid changing the
3058 gimple instruction for the first simple tests and only do this
3059 if we're allowed to change code at all. */
3060 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3061 code = PLUS_EXPR;
3063 if (code == COND_EXPR)
3065 if (! nested_in_vect_loop)
3066 *v_reduc_type = COND_REDUCTION;
3068 op3 = gimple_assign_rhs1 (def_stmt);
3069 if (COMPARISON_CLASS_P (op3))
3071 op4 = TREE_OPERAND (op3, 1);
3072 op3 = TREE_OPERAND (op3, 0);
3074 if (op3 == phi_name || op4 == phi_name)
3076 if (dump_enabled_p ())
3077 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3078 "reduction: condition depends on previous"
3079 " iteration: ");
3080 return NULL;
3083 op1 = gimple_assign_rhs2 (def_stmt);
3084 op2 = gimple_assign_rhs3 (def_stmt);
3086 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3088 if (dump_enabled_p ())
3089 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3090 "reduction: not commutative/associative: ");
3091 return NULL;
3093 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3095 op1 = gimple_assign_rhs1 (def_stmt);
3096 op2 = gimple_assign_rhs2 (def_stmt);
3098 else
3100 if (dump_enabled_p ())
3101 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3102 "reduction: not handled operation: ");
3103 return NULL;
3106 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3108 if (dump_enabled_p ())
3109 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3110 "reduction: both uses not ssa_names: ");
3112 return NULL;
3115 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3116 if ((TREE_CODE (op1) == SSA_NAME
3117 && !types_compatible_p (type,TREE_TYPE (op1)))
3118 || (TREE_CODE (op2) == SSA_NAME
3119 && !types_compatible_p (type, TREE_TYPE (op2)))
3120 || (op3 && TREE_CODE (op3) == SSA_NAME
3121 && !types_compatible_p (type, TREE_TYPE (op3)))
3122 || (op4 && TREE_CODE (op4) == SSA_NAME
3123 && !types_compatible_p (type, TREE_TYPE (op4))))
3125 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "reduction: multiple types: operation type: ");
3129 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3130 dump_printf (MSG_NOTE, ", operands types: ");
3131 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3132 TREE_TYPE (op1));
3133 dump_printf (MSG_NOTE, ",");
3134 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3135 TREE_TYPE (op2));
3136 if (op3)
3138 dump_printf (MSG_NOTE, ",");
3139 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3140 TREE_TYPE (op3));
3143 if (op4)
3145 dump_printf (MSG_NOTE, ",");
3146 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3147 TREE_TYPE (op4));
3149 dump_printf (MSG_NOTE, "\n");
3152 return NULL;
3155 /* Check whether it's ok to change the order of the computation.
3156 Generally, when vectorizing a reduction we change the order of the
3157 computation. This may change the behavior of the program in some
3158 cases, so we need to check that this is ok. One exception is when
3159 vectorizing an outer-loop: the inner-loop is executed sequentially,
3160 and therefore vectorizing reductions in the inner-loop during
3161 outer-loop vectorization is safe. */
3162 if (check_reduction
3163 && *v_reduc_type == TREE_CODE_REDUCTION
3164 && needs_fold_left_reduction_p (type, code,
3165 need_wrapping_integral_overflow))
3166 *v_reduc_type = FOLD_LEFT_REDUCTION;
3168 /* Reduction is safe. We're dealing with one of the following:
3169 1) integer arithmetic and no trapv
3170 2) floating point arithmetic, and special flags permit this optimization
3171 3) nested cycle (i.e., outer loop vectorization). */
3172 if (TREE_CODE (op1) == SSA_NAME)
3173 def1 = SSA_NAME_DEF_STMT (op1);
3175 if (TREE_CODE (op2) == SSA_NAME)
3176 def2 = SSA_NAME_DEF_STMT (op2);
3178 if (code != COND_EXPR
3179 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3181 if (dump_enabled_p ())
3182 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3183 return NULL;
3186 /* Check that one def is the reduction def, defined by PHI,
3187 the other def is either defined in the loop ("vect_internal_def"),
3188 or it's an induction (defined by a loop-header phi-node). */
3190 if (def2 && def2 == phi
3191 && (code == COND_EXPR
3192 || !def1 || gimple_nop_p (def1)
3193 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3194 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3195 && (is_gimple_assign (def1)
3196 || is_gimple_call (def1)
3197 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3198 == vect_induction_def
3199 || (gimple_code (def1) == GIMPLE_PHI
3200 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3201 == vect_internal_def
3202 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3204 if (dump_enabled_p ())
3205 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3206 return def_stmt;
3209 if (def1 && def1 == phi
3210 && (code == COND_EXPR
3211 || !def2 || gimple_nop_p (def2)
3212 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3213 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3214 && (is_gimple_assign (def2)
3215 || is_gimple_call (def2)
3216 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3217 == vect_induction_def
3218 || (gimple_code (def2) == GIMPLE_PHI
3219 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3220 == vect_internal_def
3221 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3223 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3225 /* Check if we can swap operands (just for simplicity - so that
3226 the rest of the code can assume that the reduction variable
3227 is always the last (second) argument). */
3228 if (code == COND_EXPR)
3230 /* Swap cond_expr by inverting the condition. */
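/* E.g. (ignoring NaNs) r = a < b ? r : x becomes r = a >= b ? x : r:
the comparison code is inverted and rhs2/rhs3 are swapped, which
keeps the reduction value in the last operand. */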
3231 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3232 enum tree_code invert_code = ERROR_MARK;
3233 enum tree_code cond_code = TREE_CODE (cond_expr);
3235 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3237 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3238 invert_code = invert_tree_comparison (cond_code, honor_nans);
3240 if (invert_code != ERROR_MARK)
3242 TREE_SET_CODE (cond_expr, invert_code);
3243 swap_ssa_operands (def_stmt,
3244 gimple_assign_rhs2_ptr (def_stmt),
3245 gimple_assign_rhs3_ptr (def_stmt));
3247 else
3249 if (dump_enabled_p ())
3250 report_vect_op (MSG_NOTE, def_stmt,
3251 "detected reduction: cannot swap operands "
3252 "for cond_expr");
3253 return NULL;
3256 else
3257 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3258 gimple_assign_rhs2_ptr (def_stmt));
3260 if (dump_enabled_p ())
3261 report_vect_op (MSG_NOTE, def_stmt,
3262 "detected reduction: need to swap operands: ");
3264 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3265 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3267 else
3269 if (dump_enabled_p ())
3270 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3273 return def_stmt;
3276 /* Try to find SLP reduction chain. */
3277 if (! nested_in_vect_loop
3278 && code != COND_EXPR
3279 && orig_code != MINUS_EXPR
3280 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3282 if (dump_enabled_p ())
3283 report_vect_op (MSG_NOTE, def_stmt,
3284 "reduction: detected reduction chain: ");
3286 return def_stmt;
3289 /* Dissolve the group possibly half-built by vect_is_slp_reduction. */
3290 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3291 while (first)
3293 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3294 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3295 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3296 first = next;
3299 /* Look for the expression computing loop_arg from loop PHI result. */
3300 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3301 code))
3302 return def_stmt;
3304 if (dump_enabled_p ())
3306 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3307 "reduction: unknown pattern: ");
3310 return NULL;
3313 /* Wrapper around vect_is_simple_reduction, which will modify code
3314 in-place if it enables detection of more reductions. Arguments
3315 are as for that function. */
3317 gimple *
3318 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3319 bool *double_reduc,
3320 bool need_wrapping_integral_overflow)
3322 enum vect_reduction_type v_reduc_type;
3323 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3324 need_wrapping_integral_overflow,
3325 &v_reduc_type);
3326 if (def)
3328 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3329 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3330 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3331 reduc_def_info = vinfo_for_stmt (def);
3332 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3333 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3335 return def;
3338 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
int
3340 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3341 int *peel_iters_epilogue,
3342 stmt_vector_for_cost *scalar_cost_vec,
3343 stmt_vector_for_cost *prologue_cost_vec,
3344 stmt_vector_for_cost *epilogue_cost_vec)
3346 int retval = 0;
3347 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3349 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3351 *peel_iters_epilogue = assumed_vf / 2;
3352 if (dump_enabled_p ())
3353 dump_printf_loc (MSG_NOTE, vect_location,
3354 "cost model: epilogue peel iters set to vf/2 "
3355 "because loop iterations are unknown .\n");
3357 /* If peeled iterations are known but the number of scalar loop
3358 iterations is unknown, count a taken branch per peeled loop. */
3359 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3360 NULL, 0, vect_prologue);
3361 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3362 NULL, 0, vect_epilogue);
3364 else
3366 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3367 peel_iters_prologue = niters < peel_iters_prologue ?
3368 niters : peel_iters_prologue;
3369 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
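/* Illustration (example numbers only): with 37 known iterations,
3 prologue iterations and an assumed VF of 8, the epilogue peels
(37 - 3) % 8 = 2 iterations. */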
3370 /* If we need to peel for gaps but no epilogue peeling would otherwise
3371 be required, we have to peel VF iterations. */
3372 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3373 *peel_iters_epilogue = assumed_vf;
3376 stmt_info_for_cost *si;
3377 int j;
3378 if (peel_iters_prologue)
3379 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3381 stmt_vec_info stmt_info
3382 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3383 retval += record_stmt_cost (prologue_cost_vec,
3384 si->count * peel_iters_prologue,
3385 si->kind, stmt_info, si->misalign,
3386 vect_prologue);
3388 if (*peel_iters_epilogue)
3389 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3391 stmt_vec_info stmt_info
3392 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3393 retval += record_stmt_cost (epilogue_cost_vec,
3394 si->count * *peel_iters_epilogue,
3395 si->kind, stmt_info, si->misalign,
3396 vect_epilogue);
3399 return retval;
3402 /* Function vect_estimate_min_profitable_iters
3404 Return the number of iterations required for the vector version of the
3405 loop to be profitable relative to the cost of the scalar version of the
3406 loop.
3408 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3409 of iterations for vectorization. A value of -1 means loop vectorization
3410 is not profitable. This returned value may be used for a dynamic
3411 profitability check.
3413 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3414 for static check against estimated number of iterations. */
3416 static void
3417 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3418 int *ret_min_profitable_niters,
3419 int *ret_min_profitable_estimate)
3421 int min_profitable_iters;
3422 int min_profitable_estimate;
3423 int peel_iters_prologue;
3424 int peel_iters_epilogue;
3425 unsigned vec_inside_cost = 0;
3426 int vec_outside_cost = 0;
3427 unsigned vec_prologue_cost = 0;
3428 unsigned vec_epilogue_cost = 0;
3429 int scalar_single_iter_cost = 0;
3430 int scalar_outside_cost = 0;
3431 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3432 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3433 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3435 /* Cost model disabled. */
3436 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3438 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3439 *ret_min_profitable_niters = 0;
3440 *ret_min_profitable_estimate = 0;
3441 return;
3444 /* Requires loop versioning tests to handle misalignment. */
3445 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3447 /* FIXME: Make cost depend on complexity of individual check. */
3448 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3449 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3450 vect_prologue);
3451 dump_printf (MSG_NOTE,
3452 "cost model: Adding cost of checks for loop "
3453 "versioning to treat misalignment.\n");
3456 /* Requires loop versioning with alias checks. */
3457 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3459 /* FIXME: Make cost depend on complexity of individual check. */
3460 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3461 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3462 vect_prologue);
3463 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3464 if (len)
3465 /* Count LEN - 1 ANDs and LEN comparisons. */
3466 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3467 NULL, 0, vect_prologue);
3468 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3469 if (len)
3471 /* Count LEN - 1 ANDs and LEN comparisons. */
3472 unsigned int nstmts = len * 2 - 1;
3473 /* +1 for each bias that needs adding. */
3474 for (unsigned int i = 0; i < len; ++i)
3475 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3476 nstmts += 1;
3477 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3478 NULL, 0, vect_prologue);
3480 dump_printf (MSG_NOTE,
3481 "cost model: Adding cost of checks for loop "
3482 "versioning aliasing.\n");
3485 /* Requires loop versioning with niter checks. */
3486 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3488 /* FIXME: Make cost depend on complexity of individual check. */
3489 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3490 vect_prologue);
3491 dump_printf (MSG_NOTE,
3492 "cost model: Adding cost of checks for loop "
3493 "versioning niters.\n");
3496 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3497 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3498 vect_prologue);
3500 /* Count statements in scalar loop. Using this as scalar cost for a single
3501 iteration for now.
3503 TODO: Add outer loop support.
3505 TODO: Consider assigning different costs to different scalar
3506 statements. */
3508 scalar_single_iter_cost
3509 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3511 /* Add additional cost for the peeled instructions in prologue and epilogue
3512 loop. (For fully-masked loops there will be no peeling.)
3514 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3515 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3517 TODO: Build an expression that represents peel_iters for prologue and
3518 epilogue to be used in a run-time test. */
3520 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3522 peel_iters_prologue = 0;
3523 peel_iters_epilogue = 0;
3525 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3527 /* We need to peel exactly one iteration. */
3528 peel_iters_epilogue += 1;
3529 stmt_info_for_cost *si;
3530 int j;
3531 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3532 j, si)
3534 struct _stmt_vec_info *stmt_info
3535 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3536 (void) add_stmt_cost (target_cost_data, si->count,
3537 si->kind, stmt_info, si->misalign,
3538 vect_epilogue);
3542 else if (npeel < 0)
3544 peel_iters_prologue = assumed_vf / 2;
3545 dump_printf (MSG_NOTE, "cost model: "
3546 "prologue peel iters set to vf/2.\n");
3548 /* If peeling for alignment is unknown, the loop bound of the main loop
3549 becomes unknown. */
3550 peel_iters_epilogue = assumed_vf / 2;
3551 dump_printf (MSG_NOTE, "cost model: "
3552 "epilogue peel iters set to vf/2 because "
3553 "peeling for alignment is unknown.\n");
3555 /* If peeled iterations are unknown, count a taken branch and a not taken
3556 branch per peeled loop. Even if scalar loop iterations are known,
3557 vector iterations are not known since peeled prologue iterations are
3558 not known. Hence guards remain the same. */
3559 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3560 NULL, 0, vect_prologue);
3561 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3562 NULL, 0, vect_prologue);
3563 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3564 NULL, 0, vect_epilogue);
3565 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3566 NULL, 0, vect_epilogue);
3567 stmt_info_for_cost *si;
3568 int j;
3569 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3571 struct _stmt_vec_info *stmt_info
3572 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3573 (void) add_stmt_cost (target_cost_data,
3574 si->count * peel_iters_prologue,
3575 si->kind, stmt_info, si->misalign,
3576 vect_prologue);
3577 (void) add_stmt_cost (target_cost_data,
3578 si->count * peel_iters_epilogue,
3579 si->kind, stmt_info, si->misalign,
3580 vect_epilogue);
3583 else
3585 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3586 stmt_info_for_cost *si;
3587 int j;
3588 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3590 prologue_cost_vec.create (2);
3591 epilogue_cost_vec.create (2);
3592 peel_iters_prologue = npeel;
3594 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3595 &peel_iters_epilogue,
3596 &LOOP_VINFO_SCALAR_ITERATION_COST
3597 (loop_vinfo),
3598 &prologue_cost_vec,
3599 &epilogue_cost_vec);
3601 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3603 struct _stmt_vec_info *stmt_info
3604 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3605 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3606 si->misalign, vect_prologue);
3609 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3611 struct _stmt_vec_info *stmt_info
3612 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3613 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3614 si->misalign, vect_epilogue);
3617 prologue_cost_vec.release ();
3618 epilogue_cost_vec.release ();
3621 /* FORNOW: The scalar outside cost is incremented in one of the
3622 following ways:
3624 1. The vectorizer checks for alignment and aliasing and generates
3625 a condition that allows dynamic vectorization. A cost model
3626 check is ANDed with the versioning condition. Hence the scalar code
3627 path now has the added cost of the versioning check.
3629 if (cost > th & versioning_check)
3630 jmp to vector code
3632 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3634 2. The vectorizer then checks if a prologue is required. If the
3635 cost model check was not done before during versioning, it has to
3636 be done before the prologue check.
3638 if (cost <= th)
3639 prologue = scalar_iters
3640 if (prologue == 0)
3641 jmp to vector code
3642 else
3643 execute prologue
3644 if (prologue == num_iters)
3645 go to exit
3647 Hence the run-time scalar cost is incremented by a taken branch,
3648 plus a not-taken branch, plus a taken branch cost.
3650 3. The vectorizer then checks if an epilogue is required. If the
3651 cost model check was not done before during prologue check, it
3652 has to be done with the epilogue check.
3654 if (prologue == 0)
3655 jmp to vector code
3656 else
3657 execute prologue
3658 if (prologue == num_iters)
3659 go to exit
3660 vector code:
3661 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3662 jmp to epilogue
3664 Hence the run-time scalar cost should be incremented by 2 taken
3665 branches.
3667 TODO: The back end may reorder the BBS's differently and reverse
3668 conditions/branch directions. Change the estimates below to
3669 something more reasonable. */
3671 /* If the number of iterations is known and we do not do versioning, we can
3672 decide whether to vectorize at compile time. Hence the scalar version
3673 does not carry cost model guard costs. */
3674 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3675 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3677 /* Cost model check occurs at versioning. */
3678 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3679 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3680 else
3682 /* Cost model check occurs at prologue generation. */
3683 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3684 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3685 + vect_get_stmt_cost (cond_branch_not_taken);
3686 /* Cost model check occurs at epilogue generation. */
3687 else
3688 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3692 /* Complete the target-specific cost calculations. */
3693 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3694 &vec_inside_cost, &vec_epilogue_cost);
3696 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3698 if (dump_enabled_p ())
3700 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3701 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3702 vec_inside_cost);
3703 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3704 vec_prologue_cost);
3705 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3706 vec_epilogue_cost);
3707 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3708 scalar_single_iter_cost);
3709 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3710 scalar_outside_cost);
3711 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3712 vec_outside_cost);
3713 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3714 peel_iters_prologue);
3715 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3716 peel_iters_epilogue);
3719 /* Calculate number of iterations required to make the vector version
3720 profitable, relative to the loop bodies only. The following condition
3721 must hold true:
3722 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3723 where
3724 SIC = scalar iteration cost, VIC = vector iteration cost,
3725 VOC = vector outside cost, VF = vectorization factor,
3726 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3727 SOC = scalar outside cost for run time cost model check. */
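/* Worked example with purely illustrative costs (not taken from any
   target): SIC = 4, VIC = 8, VF = 4, VOC = 50, SOC = 6, and no peeling
   (PL_ITERS = EP_ITERS = 0).  The code below computes
     min_profitable_iters = (VOC - SOC) * VF / (SIC * VF - VIC)
                          = (50 - 6) * 4 / (4 * 4 - 8) = 176 / 8 = 22;
   since 22 iterations only break even (4 * 4 * 22 == 8 * 22 + 44 * 4),
   the value is bumped to 23.  */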
3729 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3731 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3732 * assumed_vf
3733 - vec_inside_cost * peel_iters_prologue
3734 - vec_inside_cost * peel_iters_epilogue);
3735 if (min_profitable_iters <= 0)
3736 min_profitable_iters = 0;
3737 else
3739 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3740 - vec_inside_cost);
3742 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3743 <= (((int) vec_inside_cost * min_profitable_iters)
3744 + (((int) vec_outside_cost - scalar_outside_cost)
3745 * assumed_vf)))
3746 min_profitable_iters++;
3749 /* vector version will never be profitable. */
3750 else
3752 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3753 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3754 "did not happen for a simd loop");
3756 if (dump_enabled_p ())
3757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3758 "cost model: the vector iteration cost = %d "
3759 "divided by the scalar iteration cost = %d "
3760 "is greater or equal to the vectorization factor = %d"
3761 ".\n",
3762 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3763 *ret_min_profitable_niters = -1;
3764 *ret_min_profitable_estimate = -1;
3765 return;
3768 dump_printf (MSG_NOTE,
3769 " Calculated minimum iters for profitability: %d\n",
3770 min_profitable_iters);
3772 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3773 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3774 /* We want the vectorized loop to execute at least once. */
3775 min_profitable_iters = assumed_vf + peel_iters_prologue;
3777 if (dump_enabled_p ())
3778 dump_printf_loc (MSG_NOTE, vect_location,
3779 " Runtime profitability threshold = %d\n",
3780 min_profitable_iters);
3782 *ret_min_profitable_niters = min_profitable_iters;
3784 /* Calculate number of iterations required to make the vector version
3785 profitable, relative to the loop bodies only.
3787 The non-vectorized variant costs SIC * niters and it must win over the
3788 vector variant on the expected loop trip count. The following condition must hold:
3789 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
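/* Continuing the illustrative numbers above (SIC = 4, VIC = 8, VF = 4,
   VOC = 50, SOC = 6, no peeling), the code below gives
     min_profitable_estimate = (VOC + SOC) * VF / (SIC * VF - VIC)
                             = 56 * 4 / 8 = 28,
   which is then raised to at least min_profitable_iters by the MAX
   below.  */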
3791 if (vec_outside_cost <= 0)
3792 min_profitable_estimate = 0;
3793 else
3795 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3796 * assumed_vf
3797 - vec_inside_cost * peel_iters_prologue
3798 - vec_inside_cost * peel_iters_epilogue)
3799 / ((scalar_single_iter_cost * assumed_vf)
3800 - vec_inside_cost);
3802 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3803 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_NOTE, vect_location,
3805 " Static estimate profitability threshold = %d\n",
3806 min_profitable_estimate);
3808 *ret_min_profitable_estimate = min_profitable_estimate;
3811 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3812 vector elements (not bits) for a vector with NELT elements. */
3813 static void
3814 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3815 vec_perm_builder *sel)
3817 /* The encoding is a single stepped pattern. Any wrap-around is handled
3818 by vec_perm_indices. */
3819 sel->new_vector (nelt, 1, 3);
3820 for (unsigned int i = 0; i < 3; i++)
3821 sel->quick_push (i + offset);
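/* For illustration (hypothetical values): with NELT == 8 and OFFSET == 2
   the loop above pushes {2, 3, 4}; vec_perm_indices extends this stepped
   pattern to {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a shift that takes elements
   2..7 from the first input vector and elements 0..1 from the second.  */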
3824 /* Checks whether the target supports whole-vector shifts for vectors of mode
3825 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3826 it supports vec_perm_const with masks for all necessary shift amounts. */
3827 static bool
3828 have_whole_vector_shift (machine_mode mode)
3830 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3831 return true;
3833 /* Variable-length vectors should be handled via the optab. */
3834 unsigned int nelt;
3835 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3836 return false;
3838 vec_perm_builder sel;
3839 vec_perm_indices indices;
3840 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3842 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3843 indices.new_vector (sel, 2, nelt);
3844 if (!can_vec_perm_const_p (mode, indices, false))
3845 return false;
3847 return true;
3850 /* TODO: There is a close dependency between the vect_model_*_cost and
3851 vectorizable_* functions. Design this better to avoid maintenance issues. */
3853 /* Function vect_model_reduction_cost.
3855 Models cost for a reduction operation, including the vector ops
3856 generated within the strip-mine loop, the initial definition before
3857 the loop, and the epilogue code that must be generated. */
3859 static void
3860 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3861 int ncopies, stmt_vector_for_cost *cost_vec)
3863 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3864 enum tree_code code;
3865 optab optab;
3866 tree vectype;
3867 gimple *orig_stmt;
3868 machine_mode mode;
3869 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3870 struct loop *loop = NULL;
3872 if (loop_vinfo)
3873 loop = LOOP_VINFO_LOOP (loop_vinfo);
3875 /* Condition reductions generate two reductions in the loop. */
3876 vect_reduction_type reduction_type
3877 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3878 if (reduction_type == COND_REDUCTION)
3879 ncopies *= 2;
3881 vectype = STMT_VINFO_VECTYPE (stmt_info);
3882 mode = TYPE_MODE (vectype);
3883 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3885 if (!orig_stmt)
3886 orig_stmt = STMT_VINFO_STMT (stmt_info);
3888 code = gimple_assign_rhs_code (orig_stmt);
3890 if (reduction_type == EXTRACT_LAST_REDUCTION
3891 || reduction_type == FOLD_LEFT_REDUCTION)
3893 /* No extra instructions needed in the prologue. */
3894 prologue_cost = 0;
3896 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3897 /* Count one reduction-like operation per vector. */
3898 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3899 stmt_info, 0, vect_body);
3900 else
3902 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3903 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3904 inside_cost = record_stmt_cost (cost_vec, nelements,
3905 vec_to_scalar, stmt_info, 0,
3906 vect_body);
3907 inside_cost += record_stmt_cost (cost_vec, nelements,
3908 scalar_stmt, stmt_info, 0,
3909 vect_body);
3912 else
3914 /* Add in cost for initial definition.
3915 For cond reduction we have four vectors: initial index, step,
3916 initial result of the data reduction, initial value of the index
3917 reduction. */
3918 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3919 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3920 scalar_to_vec, stmt_info, 0,
3921 vect_prologue);
3923 /* Cost of reduction op inside loop. */
3924 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3925 stmt_info, 0, vect_body);
3928 /* Determine cost of epilogue code.
3930 We have a reduction operator that will reduce the vector in one statement.
3931 Also requires scalar extract. */
3933 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3935 if (reduc_fn != IFN_LAST)
3937 if (reduction_type == COND_REDUCTION)
3939 /* An EQ stmt and a COND_EXPR stmt. */
3940 epilogue_cost += record_stmt_cost (cost_vec, 2,
3941 vector_stmt, stmt_info, 0,
3942 vect_epilogue);
3943 /* Reduction of the max index and a reduction of the found
3944 values. */
3945 epilogue_cost += record_stmt_cost (cost_vec, 2,
3946 vec_to_scalar, stmt_info, 0,
3947 vect_epilogue);
3948 /* A broadcast of the max value. */
3949 epilogue_cost += record_stmt_cost (cost_vec, 1,
3950 scalar_to_vec, stmt_info, 0,
3951 vect_epilogue);
3953 else
3955 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3956 stmt_info, 0, vect_epilogue);
3957 epilogue_cost += record_stmt_cost (cost_vec, 1,
3958 vec_to_scalar, stmt_info, 0,
3959 vect_epilogue);
3962 else if (reduction_type == COND_REDUCTION)
3964 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3965 /* Extraction of scalar elements. */
3966 epilogue_cost += record_stmt_cost (cost_vec,
3967 2 * estimated_nunits,
3968 vec_to_scalar, stmt_info, 0,
3969 vect_epilogue);
3970 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3971 epilogue_cost += record_stmt_cost (cost_vec,
3972 2 * estimated_nunits - 3,
3973 scalar_stmt, stmt_info, 0,
3974 vect_epilogue);
3976 else if (reduction_type == EXTRACT_LAST_REDUCTION
3977 || reduction_type == FOLD_LEFT_REDUCTION)
3978 /* No extra instructions needed in the epilogue. */
3980 else
3982 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3983 tree bitsize =
3984 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3985 int element_bitsize = tree_to_uhwi (bitsize);
3986 int nelements = vec_size_in_bits / element_bitsize;
3988 if (code == COND_EXPR)
3989 code = MAX_EXPR;
3991 optab = optab_for_tree_code (code, vectype, optab_default);
3993 /* We have a whole vector shift available. */
3994 if (optab != unknown_optab
3995 && VECTOR_MODE_P (mode)
3996 && optab_handler (optab, mode) != CODE_FOR_nothing
3997 && have_whole_vector_shift (mode))
3999 /* Final reduction via vector shifts and the reduction operator.
4000 Also requires scalar extract. */
4001 epilogue_cost += record_stmt_cost (cost_vec,
4002 exact_log2 (nelements) * 2,
4003 vector_stmt, stmt_info, 0,
4004 vect_epilogue);
4005 epilogue_cost += record_stmt_cost (cost_vec, 1,
4006 vec_to_scalar, stmt_info, 0,
4007 vect_epilogue);
4009 else
4010 /* Use extracts and reduction op for final reduction. For N
4011 elements, we have N extracts and N-1 reduction ops. */
4012 epilogue_cost += record_stmt_cost (cost_vec,
4013 nelements + nelements - 1,
4014 vector_stmt, stmt_info, 0,
4015 vect_epilogue);
4019 if (dump_enabled_p ())
4020 dump_printf (MSG_NOTE,
4021 "vect_model_reduction_cost: inside_cost = %d, "
4022 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4023 prologue_cost, epilogue_cost);
4027 /* Function vect_model_induction_cost.
4029 Models cost for induction operations. */
4031 static void
4032 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4033 stmt_vector_for_cost *cost_vec)
4035 unsigned inside_cost, prologue_cost;
4037 if (PURE_SLP_STMT (stmt_info))
4038 return;
4040 /* loop cost for vec_loop. */
4041 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4042 stmt_info, 0, vect_body);
4044 /* prologue cost for vec_init and vec_step. */
4045 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4046 stmt_info, 0, vect_prologue);
4048 if (dump_enabled_p ())
4049 dump_printf_loc (MSG_NOTE, vect_location,
4050 "vect_model_induction_cost: inside_cost = %d, "
4051 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4056 /* Function get_initial_def_for_reduction
4058 Input:
4059 STMT - a stmt that performs a reduction operation in the loop.
4060 INIT_VAL - the initial value of the reduction variable
4062 Output:
4063 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4064 of the reduction (used for adjusting the epilog - see below).
4065 Return a vector variable, initialized according to the operation that STMT
4066 performs. This vector will be used as the initial value of the
4067 vector of partial results.
4069 Option1 (adjust in epilog): Initialize the vector as follows:
4070 add/bit or/xor: [0,0,...,0,0]
4071 mult/bit and: [1,1,...,1,1]
4072 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4073 and when necessary (e.g. add/mult case) let the caller know
4074 that it needs to adjust the result by init_val.
4076 Option2: Initialize the vector as follows:
4077 add/bit or/xor: [init_val,0,0,...,0]
4078 mult/bit and: [init_val,1,1,...,1]
4079 min/max/cond_expr: [init_val,init_val,...,init_val]
4080 and no adjustments are needed.
4082 For example, for the following code:
4084 s = init_val;
4085 for (i=0;i<n;i++)
4086 s = s + a[i];
4088 STMT is 's = s + a[i]', and the reduction variable is 's'.
4089 For a vector of 4 units, we want to return either [0,0,0,init_val],
4090 or [0,0,0,0] and let the caller know that it needs to adjust
4091 the result at the end by 'init_val'.
4093 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4094 is not NULL, because the initialization vector is then simpler (the same
4095 element in all entries), and Option2 otherwise.
4097 A cost model should help decide between these two schemes. */
4099 tree
4100 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4101 tree *adjustment_def)
4103 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4104 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4105 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4106 tree scalar_type = TREE_TYPE (init_val);
4107 tree vectype = get_vectype_for_scalar_type (scalar_type);
4108 enum tree_code code = gimple_assign_rhs_code (stmt);
4109 tree def_for_init;
4110 tree init_def;
4111 bool nested_in_vect_loop = false;
4112 REAL_VALUE_TYPE real_init_val = dconst0;
4113 int int_init_val = 0;
4114 gimple *def_stmt = NULL;
4115 gimple_seq stmts = NULL;
4117 gcc_assert (vectype);
4119 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4120 || SCALAR_FLOAT_TYPE_P (scalar_type));
4122 if (nested_in_vect_loop_p (loop, stmt))
4123 nested_in_vect_loop = true;
4124 else
4125 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4127 /* In case of double reduction we only create a vector variable to be put
4128 in the reduction phi node. The actual statement creation is done in
4129 vect_create_epilog_for_reduction. */
4130 if (adjustment_def && nested_in_vect_loop
4131 && TREE_CODE (init_val) == SSA_NAME
4132 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4133 && gimple_code (def_stmt) == GIMPLE_PHI
4134 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4135 && vinfo_for_stmt (def_stmt)
4136 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4137 == vect_double_reduction_def)
4139 *adjustment_def = NULL;
4140 return vect_create_destination_var (init_val, vectype);
4143 vect_reduction_type reduction_type
4144 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4146 /* In case of a nested reduction do not use an adjustment def, as
4147 that case is not handled correctly by the epilogue generation
4148 if ncopies is not one. */
4149 if (adjustment_def && nested_in_vect_loop)
4151 *adjustment_def = NULL;
4152 return vect_get_vec_def_for_operand (init_val, stmt);
4155 switch (code)
4157 case WIDEN_SUM_EXPR:
4158 case DOT_PROD_EXPR:
4159 case SAD_EXPR:
4160 case PLUS_EXPR:
4161 case MINUS_EXPR:
4162 case BIT_IOR_EXPR:
4163 case BIT_XOR_EXPR:
4164 case MULT_EXPR:
4165 case BIT_AND_EXPR:
4167 /* ADJUSTMENT_DEF is NULL when called from
4168 vect_create_epilog_for_reduction to vectorize double reduction. */
4169 if (adjustment_def)
4170 *adjustment_def = init_val;
4172 if (code == MULT_EXPR)
4174 real_init_val = dconst1;
4175 int_init_val = 1;
4178 if (code == BIT_AND_EXPR)
4179 int_init_val = -1;
4181 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4182 def_for_init = build_real (scalar_type, real_init_val);
4183 else
4184 def_for_init = build_int_cst (scalar_type, int_init_val);
4186 if (adjustment_def)
4187 /* Option1: the first element is '0' or '1' as well. */
4188 init_def = gimple_build_vector_from_val (&stmts, vectype,
4189 def_for_init);
4190 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4192 /* Option2 (variable length): the first element is INIT_VAL. */
4193 init_def = gimple_build_vector_from_val (&stmts, vectype,
4194 def_for_init);
4195 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4196 vectype, init_def, init_val);
4198 else
4200 /* Option2: the first element is INIT_VAL. */
4201 tree_vector_builder elts (vectype, 1, 2);
4202 elts.quick_push (init_val);
4203 elts.quick_push (def_for_init);
4204 init_def = gimple_build_vector (&stmts, &elts);
4207 break;
4209 case MIN_EXPR:
4210 case MAX_EXPR:
4211 case COND_EXPR:
4213 if (adjustment_def)
4215 *adjustment_def = NULL_TREE;
4216 if (reduction_type != COND_REDUCTION
4217 && reduction_type != EXTRACT_LAST_REDUCTION)
4219 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4220 break;
4223 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4224 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4226 break;
4228 default:
4229 gcc_unreachable ();
4232 if (stmts)
4233 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4234 return init_def;
4237 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4238 NUMBER_OF_VECTORS is the number of vector defs to create.
4239 If NEUTRAL_OP is nonnull, introducing extra elements of that
4240 value will not change the result. */
4242 static void
4243 get_initial_defs_for_reduction (slp_tree slp_node,
4244 vec<tree> *vec_oprnds,
4245 unsigned int number_of_vectors,
4246 bool reduc_chain, tree neutral_op)
4248 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4249 gimple *stmt = stmts[0];
4250 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4251 unsigned HOST_WIDE_INT nunits;
4252 unsigned j, number_of_places_left_in_vector;
4253 tree vector_type;
4254 tree vop;
4255 int group_size = stmts.length ();
4256 unsigned int vec_num, i;
4257 unsigned number_of_copies = 1;
4258 vec<tree> voprnds;
4259 voprnds.create (number_of_vectors);
4260 struct loop *loop;
4261 auto_vec<tree, 16> permute_results;
4263 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4265 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4267 loop = (gimple_bb (stmt))->loop_father;
4268 gcc_assert (loop);
4269 edge pe = loop_preheader_edge (loop);
4271 gcc_assert (!reduc_chain || neutral_op);
4273 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4274 created vectors. It is greater than 1 if unrolling is performed.
4276 For example, we have two scalar operands, s1 and s2 (e.g., group of
4277 strided accesses of size two), while NUNITS is four (i.e., four scalars
4278 of this type can be packed in a vector). The output vector will contain
4279 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4280 will be 2).
4282 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4283 vectors containing the operands.
4285 For example, NUNITS is four as before, and the group size is 8
4286 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4287 {s5, s6, s7, s8}. */
4289 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4290 nunits = group_size;
4292 number_of_copies = nunits * number_of_vectors / group_size;
4294 number_of_places_left_in_vector = nunits;
4295 bool constant_p = true;
4296 tree_vector_builder elts (vector_type, nunits, 1);
4297 elts.quick_grow (nunits);
4298 for (j = 0; j < number_of_copies; j++)
4300 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4302 tree op;
4303 /* Get the def before the loop. In a reduction chain we have only
4304 one initial value. */
4305 if ((j != (number_of_copies - 1)
4306 || (reduc_chain && i != 0))
4307 && neutral_op)
4308 op = neutral_op;
4309 else
4310 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4312 /* Create 'vect_ = {op0,op1,...,opn}'. */
4313 number_of_places_left_in_vector--;
4314 elts[number_of_places_left_in_vector] = op;
4315 if (!CONSTANT_CLASS_P (op))
4316 constant_p = false;
4318 if (number_of_places_left_in_vector == 0)
4320 gimple_seq ctor_seq = NULL;
4321 tree init;
4322 if (constant_p && !neutral_op
4323 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4324 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4325 /* Build the vector directly from ELTS. */
4326 init = gimple_build_vector (&ctor_seq, &elts);
4327 else if (neutral_op)
4329 /* Build a vector of the neutral value and shift the
4330 other elements into place. */
4331 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4332 neutral_op);
4333 int k = nunits;
4334 while (k > 0 && elts[k - 1] == neutral_op)
4335 k -= 1;
4336 while (k > 0)
4338 k -= 1;
4339 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4340 vector_type, init, elts[k]);
4343 else
4345 /* First time round, duplicate ELTS to fill the
4346 required number of vectors, then cherry pick the
4347 appropriate result for each iteration. */
4348 if (vec_oprnds->is_empty ())
4349 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4350 number_of_vectors,
4351 permute_results);
4352 init = permute_results[number_of_vectors - j - 1];
4354 if (ctor_seq != NULL)
4355 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4356 voprnds.quick_push (init);
4358 number_of_places_left_in_vector = nunits;
4359 elts.new_vector (vector_type, nunits, 1);
4360 elts.quick_grow (nunits);
4361 constant_p = true;
4366 /* Since the vectors are created in the reverse order, we should invert
4367 them. */
4368 vec_num = voprnds.length ();
4369 for (j = vec_num; j != 0; j--)
4371 vop = voprnds[j - 1];
4372 vec_oprnds->quick_push (vop);
4375 voprnds.release ();
4377 /* In case that VF is greater than the unrolling factor needed for the SLP
4378 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4379 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4380 to replicate the vectors. */
4381 tree neutral_vec = NULL;
4382 while (number_of_vectors > vec_oprnds->length ())
4384 if (neutral_op)
4386 if (!neutral_vec)
4388 gimple_seq ctor_seq = NULL;
4389 neutral_vec = gimple_build_vector_from_val
4390 (&ctor_seq, vector_type, neutral_op);
4391 if (ctor_seq != NULL)
4392 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4394 vec_oprnds->quick_push (neutral_vec);
4396 else
4398 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4399 vec_oprnds->quick_push (vop);
4405 /* Function vect_create_epilog_for_reduction
4407 Create code at the loop-epilog to finalize the result of a reduction
4408 computation.
4410 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4411 reduction statements.
4412 STMT is the scalar reduction stmt that is being vectorized.
4413 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4414 number of elements that we can fit in a vectype (nunits). In this case
4415 we have to generate more than one vector stmt - i.e - we need to "unroll"
4416 the vector stmt by a factor VF/nunits. For more details see documentation
4417 in vectorizable_operation.
4418 REDUC_FN is the internal function for the epilog reduction.
4419 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4420 computation.
4421 REDUC_INDEX is the index of the operand in the right hand side of the
4422 statement that is defined by REDUCTION_PHI.
4423 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4424 SLP_NODE is an SLP node containing a group of reduction statements. The
4425 first one in this group is STMT.
4426 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4427 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4428 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4429 any value of the IV in the loop.
4430 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4431 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4432 null if this is not an SLP reduction
4434 This function:
4435 1. Creates the reduction def-use cycles: sets the arguments for
4436 REDUCTION_PHIS:
4437 The loop-entry argument is the vectorized initial-value of the reduction.
4438 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4439 sums.
4440 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4441 by calling the function specified by REDUC_FN if available, or by
4442 other means (whole-vector shifts or a scalar loop).
4443 The function also creates a new phi node at the loop exit to preserve
4444 loop-closed form, as illustrated below.
4446 The flow at the entry to this function:
4448 loop:
4449 vec_def = phi <null, null> # REDUCTION_PHI
4450 VECT_DEF = vector_stmt # vectorized form of STMT
4451 s_loop = scalar_stmt # (scalar) STMT
4452 loop_exit:
4453 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4454 use <s_out0>
4455 use <s_out0>
4457 The above is transformed by this function into:
4459 loop:
4460 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4461 VECT_DEF = vector_stmt # vectorized form of STMT
4462 s_loop = scalar_stmt # (scalar) STMT
4463 loop_exit:
4464 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4465 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4466 v_out2 = reduce <v_out1>
4467 s_out3 = extract_field <v_out2, 0>
4468 s_out4 = adjust_result <s_out3>
4469 use <s_out4>
4470 use <s_out4>
4473 static void
4474 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4475 gimple *reduc_def_stmt,
4476 int ncopies, internal_fn reduc_fn,
4477 vec<gimple *> reduction_phis,
4478 bool double_reduc,
4479 slp_tree slp_node,
4480 slp_instance slp_node_instance,
4481 tree induc_val, enum tree_code induc_code,
4482 tree neutral_op)
4484 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4485 stmt_vec_info prev_phi_info;
4486 tree vectype;
4487 machine_mode mode;
4488 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4489 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4490 basic_block exit_bb;
4491 tree scalar_dest;
4492 tree scalar_type;
4493 gimple *new_phi = NULL, *phi;
4494 gimple_stmt_iterator exit_gsi;
4495 tree vec_dest;
4496 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4497 gimple *epilog_stmt = NULL;
4498 enum tree_code code = gimple_assign_rhs_code (stmt);
4499 gimple *exit_phi;
4500 tree bitsize;
4501 tree adjustment_def = NULL;
4502 tree vec_initial_def = NULL;
4503 tree expr, def, initial_def = NULL;
4504 tree orig_name, scalar_result;
4505 imm_use_iterator imm_iter, phi_imm_iter;
4506 use_operand_p use_p, phi_use_p;
4507 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4508 bool nested_in_vect_loop = false;
4509 auto_vec<gimple *> new_phis;
4510 auto_vec<gimple *> inner_phis;
4511 enum vect_def_type dt = vect_unknown_def_type;
4512 int j, i;
4513 auto_vec<tree> scalar_results;
4514 unsigned int group_size = 1, k, ratio;
4515 auto_vec<tree> vec_initial_defs;
4516 auto_vec<gimple *> phis;
4517 bool slp_reduc = false;
4518 bool direct_slp_reduc;
4519 tree new_phi_result;
4520 gimple *inner_phi = NULL;
4521 tree induction_index = NULL_TREE;
4523 if (slp_node)
4524 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4526 if (nested_in_vect_loop_p (loop, stmt))
4528 outer_loop = loop;
4529 loop = loop->inner;
4530 nested_in_vect_loop = true;
4531 gcc_assert (!slp_node);
4534 vectype = STMT_VINFO_VECTYPE (stmt_info);
4535 gcc_assert (vectype);
4536 mode = TYPE_MODE (vectype);
4538 /* 1. Create the reduction def-use cycle:
4539 Set the arguments of REDUCTION_PHIS, i.e., transform
4541 loop:
4542 vec_def = phi <null, null> # REDUCTION_PHI
4543 VECT_DEF = vector_stmt # vectorized form of STMT
4546 into:
4548 loop:
4549 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4550 VECT_DEF = vector_stmt # vectorized form of STMT
4553 (in case of SLP, do it for all the phis). */
4555 /* Get the loop-entry arguments. */
4556 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4557 if (slp_node)
4559 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4560 vec_initial_defs.reserve (vec_num);
4561 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4562 &vec_initial_defs, vec_num,
4563 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4564 neutral_op);
4566 else
4568 /* Get at the scalar def before the loop, that defines the initial value
4569 of the reduction variable. */
4570 gimple *def_stmt;
4571 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4572 loop_preheader_edge (loop));
4573 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4574 and we can't use zero for induc_val, use initial_def. Similarly
4575 for REDUC_MIN and initial_def larger than the base. */
4576 if (TREE_CODE (initial_def) == INTEGER_CST
4577 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4578 == INTEGER_INDUC_COND_REDUCTION)
4579 && !integer_zerop (induc_val)
4580 && ((induc_code == MAX_EXPR
4581 && tree_int_cst_lt (initial_def, induc_val))
4582 || (induc_code == MIN_EXPR
4583 && tree_int_cst_lt (induc_val, initial_def))))
4584 induc_val = initial_def;
4585 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4586 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4587 &adjustment_def);
4588 vec_initial_defs.create (1);
4589 vec_initial_defs.quick_push (vec_initial_def);
4592 /* Set phi nodes arguments. */
4593 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4595 tree vec_init_def = vec_initial_defs[i];
4596 tree def = vect_defs[i];
4597 for (j = 0; j < ncopies; j++)
4599 if (j != 0)
4601 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4602 if (nested_in_vect_loop)
4603 vec_init_def
4604 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4605 vec_init_def);
4608 /* Set the loop-entry arg of the reduction-phi. */
4610 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4611 == INTEGER_INDUC_COND_REDUCTION)
4613 /* Initialise the reduction phi to zero. This prevents non-zero
4614 initial values from interfering with the reduction op. */
4615 gcc_assert (ncopies == 1);
4616 gcc_assert (i == 0);
4618 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4619 tree induc_val_vec
4620 = build_vector_from_val (vec_init_def_type, induc_val);
4622 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4623 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4625 else
4626 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4627 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4629 /* Set the loop-latch arg for the reduction-phi. */
4630 if (j > 0)
4631 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4633 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4634 UNKNOWN_LOCATION);
4636 if (dump_enabled_p ())
4638 dump_printf_loc (MSG_NOTE, vect_location,
4639 "transform reduction: created def-use cycle: ");
4640 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4641 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4646 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4647 which is updated with the current index of the loop for every match of
4648 the original loop's cond_expr (VEC_STMT). This results in a vector
4649 containing, for each vector lane, the last index at which the condition passed.
4650 The first match will be a 1 to allow 0 to be used for non-matching
4651 indexes. If there are no matches at all then the vector will be all
4652 zeroes. */
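/* Purely illustrative example (hypothetical matches): with 4-lane
   vectors the index IV takes the values {1,2,3,4} in the first vector
   iteration and {5,6,7,8} in the second.  If the condition matches in
   lanes 1 and 3 of the first iteration and in lane 2 of the second,
   the phi evolves {0,0,0,0} -> {0,2,0,4} -> {0,2,7,4}; the maximum
   value (7) later identifies the lane holding the last matching data
   value.  */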
4653 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4655 tree indx_before_incr, indx_after_incr;
4656 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4658 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4659 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4661 int scalar_precision
4662 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4663 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4664 tree cr_index_vector_type = build_vector_type
4665 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4667 /* First we create a simple vector induction variable which starts
4668 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4669 vector size (STEP). */
4671 /* Create a {1,2,3,...} vector. */
4672 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4674 /* Create a vector of the step value. */
4675 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4676 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4678 /* Create an induction variable. */
4679 gimple_stmt_iterator incr_gsi;
4680 bool insert_after;
4681 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4682 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4683 insert_after, &indx_before_incr, &indx_after_incr);
4685 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4686 filled with zeros (VEC_ZERO). */
4688 /* Create a vector of 0s. */
4689 tree zero = build_zero_cst (cr_index_scalar_type);
4690 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4692 /* Create a vector phi node. */
4693 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4694 new_phi = create_phi_node (new_phi_tree, loop->header);
4695 set_vinfo_for_stmt (new_phi,
4696 new_stmt_vec_info (new_phi, loop_vinfo));
4697 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4698 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4700 /* Now take the condition from the loop's original cond_expr
4701 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4702 every match uses values from the induction variable
4703 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4704 (NEW_PHI_TREE).
4705 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4706 the new cond_expr (INDEX_COND_EXPR). */
4708 /* Duplicate the condition from vec_stmt. */
4709 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4711 /* Create a conditional, where the condition is taken from vec_stmt
4712 (CCOMPARE), the then-value is the induction index (INDEX_BEFORE_INCR)
4713 and the else-value is the phi (NEW_PHI_TREE). */
4714 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4715 ccompare, indx_before_incr,
4716 new_phi_tree);
4717 induction_index = make_ssa_name (cr_index_vector_type);
4718 gimple *index_condition = gimple_build_assign (induction_index,
4719 index_cond_expr);
4720 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4721 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4722 loop_vinfo);
4723 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4724 set_vinfo_for_stmt (index_condition, index_vec_info);
4726 /* Update the phi with the vec cond. */
4727 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4728 loop_latch_edge (loop), UNKNOWN_LOCATION);
4731 /* 2. Create epilog code.
4732 The reduction epilog code operates across the elements of the vector
4733 of partial results computed by the vectorized loop.
4734 The reduction epilog code consists of:
4736 step 1: compute the scalar result in a vector (v_out2)
4737 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4738 step 3: adjust the scalar result (s_out3) if needed.
4740 Step 1 can be accomplished using one of the following three schemes:
4741 (scheme 1) using reduc_fn, if available.
4742 (scheme 2) using whole-vector shifts, if available.
4743 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4744 combined.
4746 The overall epilog code looks like this:
4748 s_out0 = phi <s_loop> # original EXIT_PHI
4749 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4750 v_out2 = reduce <v_out1> # step 1
4751 s_out3 = extract_field <v_out2, 0> # step 2
4752 s_out4 = adjust_result <s_out3> # step 3
4754 (step 3 is optional, and steps 1 and 2 may be combined).
4755 Lastly, the uses of s_out0 are replaced by s_out4. */
4758 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4759 v_out1 = phi <VECT_DEF>
4760 Store them in NEW_PHIS. */
4762 exit_bb = single_exit (loop)->dest;
4763 prev_phi_info = NULL;
4764 new_phis.create (vect_defs.length ());
4765 FOR_EACH_VEC_ELT (vect_defs, i, def)
4767 for (j = 0; j < ncopies; j++)
4769 tree new_def = copy_ssa_name (def);
4770 phi = create_phi_node (new_def, exit_bb);
4771 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4772 if (j == 0)
4773 new_phis.quick_push (phi);
4774 else
4776 def = vect_get_vec_def_for_stmt_copy (dt, def);
4777 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4780 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4781 prev_phi_info = vinfo_for_stmt (phi);
4785 /* The epilogue is created for the outer-loop, i.e., for the loop being
4786 vectorized. Create exit phis for the outer loop. */
4787 if (double_reduc)
4789 loop = outer_loop;
4790 exit_bb = single_exit (loop)->dest;
4791 inner_phis.create (vect_defs.length ());
4792 FOR_EACH_VEC_ELT (new_phis, i, phi)
4794 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4795 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4796 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4797 PHI_RESULT (phi));
4798 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4799 loop_vinfo));
4800 inner_phis.quick_push (phi);
4801 new_phis[i] = outer_phi;
4802 prev_phi_info = vinfo_for_stmt (outer_phi);
4803 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4805 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4806 new_result = copy_ssa_name (PHI_RESULT (phi));
4807 outer_phi = create_phi_node (new_result, exit_bb);
4808 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4809 PHI_RESULT (phi));
4810 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4811 loop_vinfo));
4812 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4813 prev_phi_info = vinfo_for_stmt (outer_phi);
4818 exit_gsi = gsi_after_labels (exit_bb);
4820 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4821 (i.e. when reduc_fn is not available) and in the final adjustment
4822 code (if needed). Also get the original scalar reduction variable as
4823 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4824 represents a reduction pattern), the tree-code and scalar-def are
4825 taken from the original stmt that the pattern-stmt (STMT) replaces.
4826 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4827 are taken from STMT. */
4829 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4830 if (!orig_stmt)
4832 /* Regular reduction */
4833 orig_stmt = stmt;
4835 else
4837 /* Reduction pattern */
4838 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4839 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4840 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4843 code = gimple_assign_rhs_code (orig_stmt);
4844 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4845 partial results are added and not subtracted. */
4846 if (code == MINUS_EXPR)
4847 code = PLUS_EXPR;
4849 scalar_dest = gimple_assign_lhs (orig_stmt);
4850 scalar_type = TREE_TYPE (scalar_dest);
4851 scalar_results.create (group_size);
4852 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4853 bitsize = TYPE_SIZE (scalar_type);
4855 /* In case this is a reduction in an inner-loop while vectorizing an outer
4856 loop - we don't need to extract a single scalar result at the end of the
4857 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4858 outside the outer-loop). The final vector of partial results will be used
4859 in the vectorized outer-loop, or reduced to a scalar result at the end of
4860 the outer-loop. */
4861 if (nested_in_vect_loop && !double_reduc)
4862 goto vect_finalize_reduction;
4864 /* SLP reduction without reduction chain, e.g.,
4865 # a1 = phi <a2, a0>
4866 # b1 = phi <b2, b0>
4867 a2 = operation (a1)
4868 b2 = operation (b1) */
4869 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4871 /* True if we should implement SLP_REDUC using native reduction operations
4872 instead of scalar operations. */
4873 direct_slp_reduc = (reduc_fn != IFN_LAST
4874 && slp_reduc
4875 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4877 /* In case of reduction chain, e.g.,
4878 # a1 = phi <a3, a0>
4879 a2 = operation (a1)
4880 a3 = operation (a2),
4882 we may end up with more than one vector result. Here we reduce them to
4883 one vector. */
4884 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4886 tree first_vect = PHI_RESULT (new_phis[0]);
4887 gassign *new_vec_stmt = NULL;
4888 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4889 for (k = 1; k < new_phis.length (); k++)
4891 gimple *next_phi = new_phis[k];
4892 tree second_vect = PHI_RESULT (next_phi);
4893 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4894 new_vec_stmt = gimple_build_assign (tem, code,
4895 first_vect, second_vect);
4896 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4897 first_vect = tem;
4900 new_phi_result = first_vect;
4901 if (new_vec_stmt)
4903 new_phis.truncate (0);
4904 new_phis.safe_push (new_vec_stmt);
4907 /* Likewise if we couldn't use a single def-use cycle. */
4908 else if (ncopies > 1)
4910 gcc_assert (new_phis.length () == 1);
4911 tree first_vect = PHI_RESULT (new_phis[0]);
4912 gassign *new_vec_stmt = NULL;
4913 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4914 gimple *next_phi = new_phis[0];
4915 for (int k = 1; k < ncopies; ++k)
4917 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4918 tree second_vect = PHI_RESULT (next_phi);
4919 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4920 new_vec_stmt = gimple_build_assign (tem, code,
4921 first_vect, second_vect);
4922 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4923 first_vect = tem;
4925 new_phi_result = first_vect;
4926 new_phis.truncate (0);
4927 new_phis.safe_push (new_vec_stmt);
4929 else
4930 new_phi_result = PHI_RESULT (new_phis[0]);
4932 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4933 && reduc_fn != IFN_LAST)
4935 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4936 various data values where the condition matched and another vector
4937 (INDUCTION_INDEX) containing all the indexes of those matches. We
4938 need to extract the last matching index (which will be the index with
4939 highest value) and use this to index into the data vector.
4940 For the case where there were no matches, the data vector will contain
4941 all default values and the index vector will be all zeros. */
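/* Purely illustrative continuation of the example above: with
   INDUCTION_INDEX = {0,2,7,4}, IFN_REDUC_MAX yields 7, the equality
   compare gives the mask {0,0,1,0}, the VEC_COND keeps only the data
   value from lane 2, and a final unsigned IFN_REDUC_MAX extracts that
   value as the scalar result.  */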
4943 /* Get various versions of the type of the vector of indexes. */
4944 tree index_vec_type = TREE_TYPE (induction_index);
4945 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4946 tree index_scalar_type = TREE_TYPE (index_vec_type);
4947 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4948 (index_vec_type);
4950 /* Get an unsigned integer version of the type of the data vector. */
4951 int scalar_precision
4952 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4953 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4954 tree vectype_unsigned = build_vector_type
4955 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4957 /* First we need to create a vector (ZERO_VEC) of zeros and another
4958 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4959 can create using a MAX reduction and then expanding.
4960 In the case where the loop never made any matches, the max index will
4961 be zero. */
4963 /* Vector of {0, 0, 0,...}. */
4964 tree zero_vec = make_ssa_name (vectype);
4965 tree zero_vec_rhs = build_zero_cst (vectype);
4966 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4967 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4969 /* Find maximum value from the vector of found indexes. */
4970 tree max_index = make_ssa_name (index_scalar_type);
4971 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4972 1, induction_index);
4973 gimple_call_set_lhs (max_index_stmt, max_index);
4974 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4976 /* Vector of {max_index, max_index, max_index,...}. */
4977 tree max_index_vec = make_ssa_name (index_vec_type);
4978 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4979 max_index);
4980 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4981 max_index_vec_rhs);
4982 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4984 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4985 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4986 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4987 otherwise. Only one value should match, resulting in a vector
4988 (VEC_COND) with one data value and the rest zeros.
4989 In the case where the loop never made any matches, every index will
4990 match, resulting in a vector with all data values (which will all be
4991 the default value). */
4993 /* Compare the max index vector to the vector of found indexes to find
4994 the position of the max value. */
4995 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4996 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4997 induction_index,
4998 max_index_vec);
4999 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5001 /* Use the compare to choose either values from the data vector or
5002 zero. */
5003 tree vec_cond = make_ssa_name (vectype);
5004 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5005 vec_compare, new_phi_result,
5006 zero_vec);
5007 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5009 /* Finally we need to extract the data value from the vector (VEC_COND)
5010 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5011 reduction, but because this doesn't exist, we can use a MAX reduction
5012 instead. The data value might be signed or a float so we need to cast
5013 it first.
5014 In the case where the loop never made any matches, the data values are
5015 all identical, and so will reduce down correctly. */
5017 /* Make the matched data values unsigned. */
5018 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5019 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5020 vec_cond);
5021 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5022 VIEW_CONVERT_EXPR,
5023 vec_cond_cast_rhs);
5024 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5026 /* Reduce down to a scalar value. */
5027 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5028 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5029 1, vec_cond_cast);
5030 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5031 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5033 /* Convert the reduced value back to the result type and set as the
5034 result. */
5035 gimple_seq stmts = NULL;
5036 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5037 data_reduc);
5038 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5039 scalar_results.safe_push (new_temp);
5041 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5042 && reduc_fn == IFN_LAST)
5044 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5045 idx = 0;
5046 idx_val = induction_index[0];
5047 val = data_reduc[0];
5048 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5049 if (induction_index[i] > idx_val)
5050 val = data_reduc[i], idx_val = induction_index[i];
5051 return val; */
5053 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5054 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5055 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5056 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5057 /* Enforced by vectorizable_reduction, which ensures we have target
5058 support before allowing a conditional reduction on variable-length
5059 vectors. */
5060 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5061 tree idx_val = NULL_TREE, val = NULL_TREE;
5062 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5064 tree old_idx_val = idx_val;
5065 tree old_val = val;
5066 idx_val = make_ssa_name (idx_eltype);
5067 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5068 build3 (BIT_FIELD_REF, idx_eltype,
5069 induction_index,
5070 bitsize_int (el_size),
5071 bitsize_int (off)));
5072 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5073 val = make_ssa_name (data_eltype);
5074 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5075 build3 (BIT_FIELD_REF,
5076 data_eltype,
5077 new_phi_result,
5078 bitsize_int (el_size),
5079 bitsize_int (off)));
5080 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5081 if (off != 0)
5083 tree new_idx_val = idx_val;
5084 tree new_val = val;
5085 if (off != v_size - el_size)
5087 new_idx_val = make_ssa_name (idx_eltype);
5088 epilog_stmt = gimple_build_assign (new_idx_val,
5089 MAX_EXPR, idx_val,
5090 old_idx_val);
5091 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5093 new_val = make_ssa_name (data_eltype);
5094 epilog_stmt = gimple_build_assign (new_val,
5095 COND_EXPR,
5096 build2 (GT_EXPR,
5097 boolean_type_node,
5098 idx_val,
5099 old_idx_val),
5100 val, old_val);
5101 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5102 idx_val = new_idx_val;
5103 val = new_val;
5106 /* Convert the reduced value back to the result type and set as the
5107 result. */
5108 gimple_seq stmts = NULL;
5109 val = gimple_convert (&stmts, scalar_type, val);
5110 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5111 scalar_results.safe_push (val);
5114 /* 2.3 Create the reduction code, using one of the three schemes described
5115 above. In SLP we simply need to extract all the elements from the
5116 vector (without reducing them), so we use scalar shifts. */
5117 else if (reduc_fn != IFN_LAST && !slp_reduc)
5119 tree tmp;
5120 tree vec_elem_type;
5122 /* Case 1: Create:
5123 v_out2 = reduc_expr <v_out1> */
5125 if (dump_enabled_p ())
5126 dump_printf_loc (MSG_NOTE, vect_location,
5127 "Reduce using direct vector reduction.\n");
5129 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5130 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5132 tree tmp_dest
5133 = vect_create_destination_var (scalar_dest, vec_elem_type);
5134 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5135 new_phi_result);
5136 gimple_set_lhs (epilog_stmt, tmp_dest);
5137 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5138 gimple_set_lhs (epilog_stmt, new_temp);
5139 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5141 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5142 new_temp);
5144 else
5146 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5147 new_phi_result);
5148 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5151 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5152 gimple_set_lhs (epilog_stmt, new_temp);
5153 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5155 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5156 == INTEGER_INDUC_COND_REDUCTION)
5157 && !operand_equal_p (initial_def, induc_val, 0))
5159 /* Earlier we set the initial value to be a vector of induc_val
5160 values. Check the result and if it is induc_val then replace
5161 it with the original initial value, unless induc_val is
5162 the same as initial_def already. */
5163 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5164 induc_val);
5166 tmp = make_ssa_name (new_scalar_dest);
5167 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5168 initial_def, new_temp);
5169 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5170 new_temp = tmp;
5173 scalar_results.safe_push (new_temp);
5175 else if (direct_slp_reduc)
5177 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5178 with the elements for other SLP statements replaced with the
5179 neutral value. We can then do a normal reduction on each vector. */
5181 /* Enforced by vectorizable_reduction. */
5182 gcc_assert (new_phis.length () == 1);
5183 gcc_assert (pow2p_hwi (group_size));
5185 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5186 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5187 gimple_seq seq = NULL;
5189 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5190 and the same element size as VECTYPE. */
5191 tree index = build_index_vector (vectype, 0, 1);
5192 tree index_type = TREE_TYPE (index);
5193 tree index_elt_type = TREE_TYPE (index_type);
5194 tree mask_type = build_same_sized_truth_vector_type (index_type);
5196 /* Create a vector that, for each element, identifies which of
5197 the REDUC_GROUP_SIZE results should use it. */
5198 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5199 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5200 build_vector_from_val (index_type, index_mask));
5202 /* Get a neutral vector value. This is simply a splat of the neutral
5203 scalar value if we have one, otherwise the initial scalar value
5204 is itself a neutral value. */
5205 tree vector_identity = NULL_TREE;
5206 if (neutral_op)
5207 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5208 neutral_op);
5209 for (unsigned int i = 0; i < group_size; ++i)
5211 /* If there's no universal neutral value, we can use the
5212 initial scalar value from the original PHI. This is used
5213 for MIN and MAX reduction, for example. */
5214 if (!neutral_op)
5216 tree scalar_value
5217 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5218 loop_preheader_edge (loop));
5219 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5220 scalar_value);
5223 /* Calculate the equivalent of:
5225 sel[j] = (index[j] == i);
5227 which selects the elements of NEW_PHI_RESULT that should
5228 be included in the result. */
5229 tree compare_val = build_int_cst (index_elt_type, i);
5230 compare_val = build_vector_from_val (index_type, compare_val);
5231 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5232 index, compare_val);
5234 /* Calculate the equivalent of:
5236 vec = sel ? new_phi_result : vector_identity;
5238 VEC is now suitable for a full vector reduction. */
5239 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5240 sel, new_phi_result, vector_identity);
5242 /* Do the reduction and convert it to the appropriate type. */
5243 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5244 TREE_TYPE (vectype), vec);
5245 scalar = gimple_convert (&seq, scalar_type, scalar);
5246 scalar_results.safe_push (scalar);
5248 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5250 else
5252 bool reduce_with_shift;
5253 tree vec_temp;
5255 /* COND reductions all do the final reduction with MAX_EXPR
5256 or MIN_EXPR. */
5257 if (code == COND_EXPR)
5259 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5260 == INTEGER_INDUC_COND_REDUCTION)
5261 code = induc_code;
5262 else
5263 code = MAX_EXPR;
5266 /* See if the target wants to do the final (shift) reduction
5267 in a vector mode of smaller size and first reduce upper/lower
5268 halves against each other. */
5269 enum machine_mode mode1 = mode;
5270 tree vectype1 = vectype;
5271 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5272 unsigned sz1 = sz;
5273 if (!slp_reduc
5274 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5275 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5277 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5278 reduce_with_shift = have_whole_vector_shift (mode1);
5279 if (!VECTOR_MODE_P (mode1))
5280 reduce_with_shift = false;
5281 else
5283 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5284 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5285 reduce_with_shift = false;
5288 /* First reduce the vector to the desired vector size on which we
5289 should do the shift reduction, by combining upper and lower halves. */
5290 new_temp = new_phi_result;
5291 while (sz > sz1)
5293 gcc_assert (!slp_reduc);
5294 sz /= 2;
5295 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5297 /* The target has to make sure we support lowpart/highpart
5298 extraction, either via direct vector extract or through
5299 integer mode punning. */
5300 tree dst1, dst2;
5301 if (convert_optab_handler (vec_extract_optab,
5302 TYPE_MODE (TREE_TYPE (new_temp)),
5303 TYPE_MODE (vectype1))
5304 != CODE_FOR_nothing)
5306 /* Extract sub-vectors directly once vec_extract becomes
5307 a conversion optab. */
5308 dst1 = make_ssa_name (vectype1);
5309 epilog_stmt
5310 = gimple_build_assign (dst1, BIT_FIELD_REF,
5311 build3 (BIT_FIELD_REF, vectype1,
5312 new_temp, TYPE_SIZE (vectype1),
5313 bitsize_int (0)));
5314 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5315 dst2 = make_ssa_name (vectype1);
5316 epilog_stmt
5317 = gimple_build_assign (dst2, BIT_FIELD_REF,
5318 build3 (BIT_FIELD_REF, vectype1,
5319 new_temp, TYPE_SIZE (vectype1),
5320 bitsize_int (sz * BITS_PER_UNIT)));
5321 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5323 else
5325 /* Extract via punning to an appropriately sized integer mode
5326 vector. */
5327 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5329 tree etype = build_vector_type (eltype, 2);
5330 gcc_assert (convert_optab_handler (vec_extract_optab,
5331 TYPE_MODE (etype),
5332 TYPE_MODE (eltype))
5333 != CODE_FOR_nothing);
5334 tree tem = make_ssa_name (etype);
5335 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5336 build1 (VIEW_CONVERT_EXPR,
5337 etype, new_temp));
5338 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339 new_temp = tem;
5340 tem = make_ssa_name (eltype);
5341 epilog_stmt
5342 = gimple_build_assign (tem, BIT_FIELD_REF,
5343 build3 (BIT_FIELD_REF, eltype,
5344 new_temp, TYPE_SIZE (eltype),
5345 bitsize_int (0)));
5346 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347 dst1 = make_ssa_name (vectype1);
5348 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5349 build1 (VIEW_CONVERT_EXPR,
5350 vectype1, tem));
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352 tem = make_ssa_name (eltype);
5353 epilog_stmt
5354 = gimple_build_assign (tem, BIT_FIELD_REF,
5355 build3 (BIT_FIELD_REF, eltype,
5356 new_temp, TYPE_SIZE (eltype),
5357 bitsize_int (sz * BITS_PER_UNIT)));
5358 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5359 dst2 = make_ssa_name (vectype1);
5360 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5361 build1 (VIEW_CONVERT_EXPR,
5362 vectype1, tem));
5363 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5366 new_temp = make_ssa_name (vectype1);
5367 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5368 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5371 if (reduce_with_shift && !slp_reduc)
5373 int element_bitsize = tree_to_uhwi (bitsize);
5374 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5375 for variable-length vectors and also requires direct target support
5376 for loop reductions. */
5377 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5378 int nelements = vec_size_in_bits / element_bitsize;
5379 vec_perm_builder sel;
5380 vec_perm_indices indices;
5382 int elt_offset;
5384 tree zero_vec = build_zero_cst (vectype1);
5385 /* Case 2: Create:
5386 for (offset = nelements/2; offset >= 1; offset/=2)
5388 Create: va' = vec_shift <va, offset>
5389 Create: va = vop <va, va'>
5390 } */
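/* A minimal sketch for a hypothetical 4-element vector
   va = {a0, a1, a2, a3} and a PLUS reduction:
     offset 2: va' = {a2, a3, 0, 0}; va = {a0+a2, a1+a3, a2, a3}
     offset 1: va' = {a1+a3, a2, a3, 0}; va = {a0+a1+a2+a3, ...}
   so element 0 of VA ends up holding the whole reduction and is the
   element extracted below; the shifted-in lanes come from ZERO_VEC
   and only pad positions whose results are not used. */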
5392 tree rhs;
5394 if (dump_enabled_p ())
5395 dump_printf_loc (MSG_NOTE, vect_location,
5396 "Reduce using vector shifts\n");
5398 mode1 = TYPE_MODE (vectype1);
5399 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5400 for (elt_offset = nelements / 2;
5401 elt_offset >= 1;
5402 elt_offset /= 2)
5404 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5405 indices.new_vector (sel, 2, nelements);
5406 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5407 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5408 new_temp, zero_vec, mask);
5409 new_name = make_ssa_name (vec_dest, epilog_stmt);
5410 gimple_assign_set_lhs (epilog_stmt, new_name);
5411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5413 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5414 new_temp);
5415 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5416 gimple_assign_set_lhs (epilog_stmt, new_temp);
5417 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5420 /* 2.4 Extract the final scalar result. Create:
5421 s_out3 = extract_field <v_out2, bitpos> */
5423 if (dump_enabled_p ())
5424 dump_printf_loc (MSG_NOTE, vect_location,
5425 "extract scalar result\n");
5427 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5428 bitsize, bitsize_zero_node);
5429 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5430 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5431 gimple_assign_set_lhs (epilog_stmt, new_temp);
5432 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5433 scalar_results.safe_push (new_temp);
5435 else
5437 /* Case 3: Create:
5438 s = extract_field <v_out2, 0>
5439 for (offset = element_size;
5440 offset < vector_size;
5441 offset += element_size;)
5443 Create: s' = extract_field <v_out2, offset>
5444 Create: s = op <s, s'> // For non SLP cases
5445 } */
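/* As a rough example, for a 4-element vector {a0, a1, a2, a3} and a
   PLUS reduction this open-codes
     s = a0; s = s + a1; s = s + a2; s = s + a3;
   with one BIT_FIELD_REF extraction per element; in the SLP case the
   individual extractions are pushed to SCALAR_RESULTS instead of
   being combined. */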
5447 if (dump_enabled_p ())
5448 dump_printf_loc (MSG_NOTE, vect_location,
5449 "Reduce using scalar code.\n");
5451 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5452 int element_bitsize = tree_to_uhwi (bitsize);
5453 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5455 int bit_offset;
5456 if (gimple_code (new_phi) == GIMPLE_PHI)
5457 vec_temp = PHI_RESULT (new_phi);
5458 else
5459 vec_temp = gimple_assign_lhs (new_phi);
5460 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5461 bitsize_zero_node);
5462 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5463 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5464 gimple_assign_set_lhs (epilog_stmt, new_temp);
5465 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5467 /* In SLP we don't need to apply the reduction operation, so we
5468 just collect the s' values in SCALAR_RESULTS. */
5469 if (slp_reduc)
5470 scalar_results.safe_push (new_temp);
5472 for (bit_offset = element_bitsize;
5473 bit_offset < vec_size_in_bits;
5474 bit_offset += element_bitsize)
5476 tree bitpos = bitsize_int (bit_offset);
5477 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5478 bitsize, bitpos);
5480 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5481 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5482 gimple_assign_set_lhs (epilog_stmt, new_name);
5483 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5485 if (slp_reduc)
5487 /* In SLP we don't need to apply the reduction operation, so
5488 we just collect the s' values in SCALAR_RESULTS. */
5489 new_temp = new_name;
5490 scalar_results.safe_push (new_name);
5492 else
5494 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5495 new_name, new_temp);
5496 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5497 gimple_assign_set_lhs (epilog_stmt, new_temp);
5498 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5503 /* The only case where we need to reduce scalar results in SLP is
5504 unrolling. If the size of SCALAR_RESULTS is greater than
5505 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5506 REDUC_GROUP_SIZE. */
5507 if (slp_reduc)
5509 tree res, first_res, new_res;
5510 gimple *new_stmt;
5512 /* Reduce multiple scalar results in case of SLP unrolling. */
5513 for (j = group_size; scalar_results.iterate (j, &res);
5514 j++)
5516 first_res = scalar_results[j % group_size];
5517 new_stmt = gimple_build_assign (new_scalar_dest, code,
5518 first_res, res);
5519 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5520 gimple_assign_set_lhs (new_stmt, new_res);
5521 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5522 scalar_results[j % group_size] = new_res;
5525 else
5526 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5527 scalar_results.safe_push (new_temp);
5530 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5531 == INTEGER_INDUC_COND_REDUCTION)
5532 && !operand_equal_p (initial_def, induc_val, 0))
5534 /* Earlier we set the initial value to be a vector of induc_val
5535 values. Check the result and if it is induc_val then replace
5536 it with the original initial value, unless induc_val is
5537 already the same as initial_def. */
5538 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5539 induc_val);
5541 tree tmp = make_ssa_name (new_scalar_dest);
5542 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5543 initial_def, new_temp);
5544 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5545 scalar_results[0] = tmp;
5549 vect_finalize_reduction:
5551 if (double_reduc)
5552 loop = loop->inner;
5554 /* 2.5 Adjust the final result by the initial value of the reduction
5555 variable. (When such adjustment is not needed, then
5556 'adjustment_def' is zero). For example, if code is PLUS we create:
5557 new_temp = loop_exit_def + adjustment_def */
5559 if (adjustment_def)
5561 gcc_assert (!slp_reduc);
5562 if (nested_in_vect_loop)
5564 new_phi = new_phis[0];
5565 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5566 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5567 new_dest = vect_create_destination_var (scalar_dest, vectype);
5569 else
5571 new_temp = scalar_results[0];
5572 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5573 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5574 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5577 epilog_stmt = gimple_build_assign (new_dest, expr);
5578 new_temp = make_ssa_name (new_dest, epilog_stmt);
5579 gimple_assign_set_lhs (epilog_stmt, new_temp);
5580 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5581 if (nested_in_vect_loop)
5583 set_vinfo_for_stmt (epilog_stmt,
5584 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5585 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5586 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5588 if (!double_reduc)
5589 scalar_results.quick_push (new_temp);
5590 else
5591 scalar_results[0] = new_temp;
5593 else
5594 scalar_results[0] = new_temp;
5596 new_phis[0] = epilog_stmt;
5599 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5600 phis with new adjusted scalar results, i.e., replace use <s_out0>
5601 with use <s_out4>.
5603 Transform:
5604 loop_exit:
5605 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5606 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5607 v_out2 = reduce <v_out1>
5608 s_out3 = extract_field <v_out2, 0>
5609 s_out4 = adjust_result <s_out3>
5610 use <s_out0>
5611 use <s_out0>
5613 into:
5615 loop_exit:
5616 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5617 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5618 v_out2 = reduce <v_out1>
5619 s_out3 = extract_field <v_out2, 0>
5620 s_out4 = adjust_result <s_out3>
5621 use <s_out4>
5622 use <s_out4> */
5625 /* In SLP reduction chain we reduce vector results into one vector if
5626 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5627 LHS of the last stmt in the reduction chain, since we are looking for
5628 the loop exit phi node. */
5629 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5631 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5632 /* Handle reduction patterns. */
5633 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5634 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5636 scalar_dest = gimple_assign_lhs (dest_stmt);
5637 group_size = 1;
5640 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5641 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5642 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5643 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5644 correspond to the first vector stmt, etc.
5645 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
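/* For example (a sketch): with REDUC_GROUP_SIZE 4 and two new vector
   stmts, RATIO is 2, so scalar results 0-1 come from the first vector
   stmt and scalar results 2-3 from the second. */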
5646 if (group_size > new_phis.length ())
5648 ratio = group_size / new_phis.length ();
5649 gcc_assert (!(group_size % new_phis.length ()));
5651 else
5652 ratio = 1;
5654 for (k = 0; k < group_size; k++)
5656 if (k % ratio == 0)
5658 epilog_stmt = new_phis[k / ratio];
5659 reduction_phi = reduction_phis[k / ratio];
5660 if (double_reduc)
5661 inner_phi = inner_phis[k / ratio];
5664 if (slp_reduc)
5666 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5668 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5669 /* SLP statements can't participate in patterns. */
5670 gcc_assert (!orig_stmt);
5671 scalar_dest = gimple_assign_lhs (current_stmt);
5674 phis.create (3);
5675 /* Find the loop-closed-use at the loop exit of the original scalar
5676 result. (The reduction result is expected to have two immediate uses -
5677 one at the latch block, and one at the loop exit). */
5678 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5679 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5680 && !is_gimple_debug (USE_STMT (use_p)))
5681 phis.safe_push (USE_STMT (use_p));
5683 /* While we expect to have found an exit_phi because of loop-closed-ssa
5684 form we can end up without one if the scalar cycle is dead. */
5686 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5688 if (outer_loop)
5690 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5691 gphi *vect_phi;
5693 /* FORNOW. We do not currently support the case where an inner-loop
5694 reduction is not used in the outer loop (but only outside the
5695 outer loop), unless it is a double reduction. */
5696 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5697 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5698 || double_reduc);
5700 if (double_reduc)
5701 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5702 else
5703 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5704 if (!double_reduc
5705 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5706 != vect_double_reduction_def)
5707 continue;
5709 /* Handle double reduction:
5711 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5712 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5713 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5714 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5716 At that point the regular reduction (stmt2 and stmt3) is
5717 already vectorized, as well as the exit phi node, stmt4.
5718 Here we vectorize the phi node of double reduction, stmt1, and
5719 update all relevant statements. */
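/* For illustration only, a source-level shape (with made-up names)
   that gives rise to such a double reduction:

     int s = 0;
     for (int i = 0; i < n; i++)      <-- outer loop (stmt1/stmt4)
       for (int j = 0; j < m; j++)    <-- inner loop (stmt2/stmt3)
         s += a[i][j]; */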
5721 /* Go through all the uses of s2 to find double reduction phi
5722 node, i.e., stmt1 above. */
5723 orig_name = PHI_RESULT (exit_phi);
5724 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5726 stmt_vec_info use_stmt_vinfo;
5727 stmt_vec_info new_phi_vinfo;
5728 tree vect_phi_init, preheader_arg, vect_phi_res;
5729 basic_block bb = gimple_bb (use_stmt);
5730 gimple *use;
5732 /* Check that USE_STMT is really a double reduction phi
5733 node. */
5734 if (gimple_code (use_stmt) != GIMPLE_PHI
5735 || gimple_phi_num_args (use_stmt) != 2
5736 || bb->loop_father != outer_loop)
5737 continue;
5738 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5739 if (!use_stmt_vinfo
5740 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5741 != vect_double_reduction_def)
5742 continue;
5744 /* Create vector phi node for double reduction:
5745 vs1 = phi <vs0, vs2>
5746 vs1 was created previously in this function by a call to
5747 vect_get_vec_def_for_operand and is stored in
5748 vec_initial_def;
5749 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5750 vs0 is created here. */
5752 /* Create vector phi node. */
5753 vect_phi = create_phi_node (vec_initial_def, bb);
5754 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5755 loop_vec_info_for_loop (outer_loop));
5756 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5758 /* Create vs0 - initial def of the double reduction phi. */
5759 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5760 loop_preheader_edge (outer_loop));
5761 vect_phi_init = get_initial_def_for_reduction
5762 (stmt, preheader_arg, NULL);
5764 /* Update phi node arguments with vs0 and vs2. */
5765 add_phi_arg (vect_phi, vect_phi_init,
5766 loop_preheader_edge (outer_loop),
5767 UNKNOWN_LOCATION);
5768 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5769 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5770 if (dump_enabled_p ())
5772 dump_printf_loc (MSG_NOTE, vect_location,
5773 "created double reduction phi node: ");
5774 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5777 vect_phi_res = PHI_RESULT (vect_phi);
5779 /* Replace the use, i.e., set the correct vs1 in the regular
5780 reduction phi node. FORNOW, NCOPIES is always 1, so the
5781 loop is redundant. */
5782 use = reduction_phi;
5783 for (j = 0; j < ncopies; j++)
5785 edge pr_edge = loop_preheader_edge (loop);
5786 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5787 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5793 phis.release ();
5794 if (nested_in_vect_loop)
5796 if (double_reduc)
5797 loop = outer_loop;
5798 else
5799 continue;
5802 phis.create (3);
5803 /* Find the loop-closed-use at the loop exit of the original scalar
5804 result. (The reduction result is expected to have two immediate uses,
5805 one at the latch block, and one at the loop exit). For double
5806 reductions we are looking for exit phis of the outer loop. */
5807 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5809 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5811 if (!is_gimple_debug (USE_STMT (use_p)))
5812 phis.safe_push (USE_STMT (use_p));
5814 else
5816 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5818 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5820 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5822 if (!flow_bb_inside_loop_p (loop,
5823 gimple_bb (USE_STMT (phi_use_p)))
5824 && !is_gimple_debug (USE_STMT (phi_use_p)))
5825 phis.safe_push (USE_STMT (phi_use_p));
5831 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5833 /* Replace the uses: */
5834 orig_name = PHI_RESULT (exit_phi);
5835 scalar_result = scalar_results[k];
5836 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5837 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5838 SET_USE (use_p, scalar_result);
5841 phis.release ();
5845 /* Return a vector of type VECTYPE that is equal to the vector select
5846 operation "MASK ? VEC : IDENTITY". Insert the select statements
5847 before GSI. */
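/* E.g. (a sketch): with MASK = {1, 0, 1, 0}, VEC = {a, b, c, d} and
   IDENTITY = {0, 0, 0, 0} the returned value is {a, 0, c, 0}. */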
5849 static tree
5850 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5851 tree vec, tree identity)
5853 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5854 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5855 mask, vec, identity);
5856 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5857 return cond;
5860 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5861 order, starting with LHS. Insert the extraction statements before GSI and
5862 associate the new scalar SSA names with variable SCALAR_DEST.
5863 Return the SSA name for the result. */
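/* For example (a sketch), with a 4-element VECTOR_RHS v and CODE PLUS
   this emits the strictly left-to-right sequence
     lhs1 = lhs + v[0];
     lhs2 = lhs1 + v[1];
     lhs3 = lhs2 + v[2];
     lhs4 = lhs3 + v[3];
   and returns lhs4, preserving the original evaluation order instead
   of using a tree-shaped reduction. */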
5865 static tree
5866 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5867 tree_code code, tree lhs, tree vector_rhs)
5869 tree vectype = TREE_TYPE (vector_rhs);
5870 tree scalar_type = TREE_TYPE (vectype);
5871 tree bitsize = TYPE_SIZE (scalar_type);
5872 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5873 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5875 for (unsigned HOST_WIDE_INT bit_offset = 0;
5876 bit_offset < vec_size_in_bits;
5877 bit_offset += element_bitsize)
5879 tree bitpos = bitsize_int (bit_offset);
5880 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5881 bitsize, bitpos);
5883 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5884 rhs = make_ssa_name (scalar_dest, stmt);
5885 gimple_assign_set_lhs (stmt, rhs);
5886 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5888 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5889 tree new_name = make_ssa_name (scalar_dest, stmt);
5890 gimple_assign_set_lhs (stmt, new_name);
5891 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5892 lhs = new_name;
5894 return lhs;
5897 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5898 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5899 statement. CODE is the operation performed by STMT and OPS are
5900 its scalar operands. REDUC_INDEX is the index of the operand in
5901 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5902 implements in-order reduction, or IFN_LAST if we should open-code it.
5903 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5904 that should be used to control the operation in a fully-masked loop. */
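/* As an illustrative sketch, the kind of scalar loop this handles is

     double r = init;
     for (int i = 0; i < n; i++)
       r += a[i];

   compiled without reassociation, so the additions must happen in
   source order. When REDUC_FN is available, each vector of a[] is
   folded into the accumulator with a single internal-function call;
   otherwise the fold is open-coded via vect_expand_fold_left. */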
5906 static bool
5907 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5908 gimple **vec_stmt, slp_tree slp_node,
5909 gimple *reduc_def_stmt,
5910 tree_code code, internal_fn reduc_fn,
5911 tree ops[3], tree vectype_in,
5912 int reduc_index, vec_loop_masks *masks)
5914 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5916 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5917 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5918 gimple *new_stmt = NULL;
5920 int ncopies;
5921 if (slp_node)
5922 ncopies = 1;
5923 else
5924 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5926 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5927 gcc_assert (ncopies == 1);
5928 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5929 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5930 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5931 == FOLD_LEFT_REDUCTION);
5933 if (slp_node)
5934 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5935 TYPE_VECTOR_SUBPARTS (vectype_in)));
5937 tree op0 = ops[1 - reduc_index];
5939 int group_size = 1;
5940 gimple *scalar_dest_def;
5941 auto_vec<tree> vec_oprnds0;
5942 if (slp_node)
5944 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5945 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5946 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5948 else
5950 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5951 vec_oprnds0.create (1);
5952 vec_oprnds0.quick_push (loop_vec_def0);
5953 scalar_dest_def = stmt;
5956 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5957 tree scalar_type = TREE_TYPE (scalar_dest);
5958 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5960 int vec_num = vec_oprnds0.length ();
5961 gcc_assert (vec_num == 1 || slp_node);
5962 tree vec_elem_type = TREE_TYPE (vectype_out);
5963 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5965 tree vector_identity = NULL_TREE;
5966 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5967 vector_identity = build_zero_cst (vectype_out);
5969 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5970 int i;
5971 tree def0;
5972 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5974 tree mask = NULL_TREE;
5975 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5976 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5978 /* Handle MINUS by adding the negative. */
5979 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5981 tree negated = make_ssa_name (vectype_out);
5982 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5983 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5984 def0 = negated;
5987 if (mask)
5988 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5989 vector_identity);
5991 /* On the first iteration the input is simply the scalar phi
5992 result, and for subsequent iterations it is the output of
5993 the preceding operation. */
5994 if (reduc_fn != IFN_LAST)
5996 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5997 /* For chained SLP reductions the output of the previous reduction
5998 operation serves as the input of the next. For the final statement
5999 the output cannot be a temporary - we reuse the original
6000 scalar destination of the last statement. */
6001 if (i != vec_num - 1)
6003 gimple_set_lhs (new_stmt, scalar_dest_var);
6004 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6005 gimple_set_lhs (new_stmt, reduc_var);
6008 else
6010 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6011 reduc_var, def0);
6012 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6013 /* Remove the statement, so that we can use the same code paths
6014 as for statements that we've just created. */
6015 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6016 gsi_remove (&tmp_gsi, false);
6019 if (i == vec_num - 1)
6021 gimple_set_lhs (new_stmt, scalar_dest);
6022 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6024 else
6025 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6027 if (slp_node)
6028 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6031 if (!slp_node)
6032 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6034 return true;
6037 /* Function is_nonwrapping_integer_induction.
6039 Check whether STMT (which is part of loop LOOP) is an integer
6040 induction that increments without causing overflow. */
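/* Roughly (a sketch of the check below): with constant BASE and STEP,
   require that
     BASE + STEP * max_stmt_executions (LOOP)
   still fits in the precision of the PHI result type, so the induction
   cannot wrap; types with undefined overflow are accepted directly. */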
6042 static bool
6043 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6045 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6046 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6047 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6048 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6049 widest_int ni, max_loop_value, lhs_max;
6050 bool overflow = false;
6052 /* Make sure the loop is integer based. */
6053 if (TREE_CODE (base) != INTEGER_CST
6054 || TREE_CODE (step) != INTEGER_CST)
6055 return false;
6057 /* Check that the max size of the loop will not wrap. */
6059 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6060 return true;
6062 if (! max_stmt_executions (loop, &ni))
6063 return false;
6065 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6066 &overflow);
6067 if (overflow)
6068 return false;
6070 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6071 TYPE_SIGN (lhs_type), &overflow);
6072 if (overflow)
6073 return false;
6075 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6076 <= TYPE_PRECISION (lhs_type));
6079 /* Function vectorizable_reduction.
6081 Check if STMT performs a reduction operation that can be vectorized.
6082 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6083 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6084 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6086 This function also handles reduction idioms (patterns) that have been
6087 recognized in advance during vect_pattern_recog. In this case, STMT may be
6088 of this form:
6089 X = pattern_expr (arg0, arg1, ..., X)
6090 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6091 sequence that had been detected and replaced by the pattern-stmt (STMT).
6093 This function also handles reduction of condition expressions, for example:
6094 for (int i = 0; i < N; i++)
6095 if (a[i] < value)
6096 last = a[i];
6097 This is handled by vectorizing the loop and creating an additional vector
6098 containing the loop indexes for which "a[i] < value" was true. In the
6099 function epilogue this is reduced to a single max value and then used to
6100 index into the vector of results.
6102 In some cases of reduction patterns, the type of the reduction variable X is
6103 different than the type of the other arguments of STMT.
6104 In such cases, the vectype that is used when transforming STMT into a vector
6105 stmt is different than the vectype that is used to determine the
6106 vectorization factor, because it consists of a different number of elements
6107 than the actual number of elements that are being operated upon in parallel.
6109 For example, consider an accumulation of shorts into an int accumulator.
6110 On some targets it's possible to vectorize this pattern operating on 8
6111 shorts at a time (hence, the vectype for purposes of determining the
6112 vectorization factor should be V8HI); on the other hand, the vectype that
6113 is used to create the vector form is actually V4SI (the type of the result).
6115 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6116 indicates what is the actual level of parallelism (V8HI in the example), so
6117 that the right vectorization factor would be derived. This vectype
6118 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6119 be used to create the vectorized stmt. The right vectype for the vectorized
6120 stmt is obtained from the type of the result X:
6121 get_vectype_for_scalar_type (TREE_TYPE (X))
6123 This means that, contrary to "regular" reductions (or "regular" stmts in
6124 general), the following equation:
6125 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6126 does *NOT* necessarily hold for reduction patterns. */
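/* As a small worked sketch of the condition reduction described above,
   with a 4-element vector, a = {9, 1, 7, 3} and value = 5:

     mask = {0, 1, 0, 1}  (a[i] < value)
     indexes = {1, 2, 3, 4}  (1-based, so 0 means "no match")
     data = mask ? indexes : 0 = {0, 2, 0, 4}

   the epilogue reduces DATA with MAX to obtain 4 and uses that index
   to select the matching element, giving last = 3. */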
6128 bool
6129 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6130 gimple **vec_stmt, slp_tree slp_node,
6131 slp_instance slp_node_instance,
6132 stmt_vector_for_cost *cost_vec)
6134 tree vec_dest;
6135 tree scalar_dest;
6136 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6137 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6138 tree vectype_in = NULL_TREE;
6139 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6140 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6141 enum tree_code code, orig_code;
6142 internal_fn reduc_fn;
6143 machine_mode vec_mode;
6144 int op_type;
6145 optab optab;
6146 tree new_temp = NULL_TREE;
6147 gimple *def_stmt;
6148 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6149 gimple *cond_reduc_def_stmt = NULL;
6150 enum tree_code cond_reduc_op_code = ERROR_MARK;
6151 tree scalar_type;
6152 bool is_simple_use;
6153 gimple *orig_stmt;
6154 stmt_vec_info orig_stmt_info = NULL;
6155 int i;
6156 int ncopies;
6157 int epilog_copies;
6158 stmt_vec_info prev_stmt_info, prev_phi_info;
6159 bool single_defuse_cycle = false;
6160 gimple *new_stmt = NULL;
6161 int j;
6162 tree ops[3];
6163 enum vect_def_type dts[3];
6164 bool nested_cycle = false, found_nested_cycle_def = false;
6165 bool double_reduc = false;
6166 basic_block def_bb;
6167 struct loop * def_stmt_loop, *outer_loop = NULL;
6168 tree def_arg;
6169 gimple *def_arg_stmt;
6170 auto_vec<tree> vec_oprnds0;
6171 auto_vec<tree> vec_oprnds1;
6172 auto_vec<tree> vec_oprnds2;
6173 auto_vec<tree> vect_defs;
6174 auto_vec<gimple *> phis;
6175 int vec_num;
6176 tree def0, tem;
6177 bool first_p = true;
6178 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6179 tree cond_reduc_val = NULL_TREE;
6181 /* Make sure it was already recognized as a reduction computation. */
6182 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6183 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6184 return false;
6186 if (nested_in_vect_loop_p (loop, stmt))
6188 outer_loop = loop;
6189 loop = loop->inner;
6190 nested_cycle = true;
6193 /* In case of reduction chain we switch to the first stmt in the chain, but
6194 we don't update STMT_INFO, since only the last stmt is marked as reduction
6195 and has reduction properties. */
6196 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6197 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6199 stmt = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6200 first_p = false;
6203 if (gimple_code (stmt) == GIMPLE_PHI)
6205 /* Analysis is fully done on the reduction stmt invocation. */
6206 if (! vec_stmt)
6208 if (slp_node)
6209 slp_node_instance->reduc_phis = slp_node;
6211 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6212 return true;
6215 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6216 /* Leave the scalar phi in place. Note that checking
6217 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6218 for reductions involving a single statement. */
6219 return true;
6221 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6222 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6223 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6225 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6226 == EXTRACT_LAST_REDUCTION)
6227 /* Leave the scalar phi in place. */
6228 return true;
6230 gcc_assert (is_gimple_assign (reduc_stmt));
6231 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6233 tree op = gimple_op (reduc_stmt, k);
6234 if (op == gimple_phi_result (stmt))
6235 continue;
6236 if (k == 1
6237 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6238 continue;
6239 if (!vectype_in
6240 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6241 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6242 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6243 break;
6245 gcc_assert (vectype_in);
6247 if (slp_node)
6248 ncopies = 1;
6249 else
6250 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6252 use_operand_p use_p;
6253 gimple *use_stmt;
6254 if (ncopies > 1
6255 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6256 <= vect_used_only_live)
6257 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6258 && (use_stmt == reduc_stmt
6259 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6260 == reduc_stmt)))
6261 single_defuse_cycle = true;
6263 /* Create the destination vector */
6264 scalar_dest = gimple_assign_lhs (reduc_stmt);
6265 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6267 if (slp_node)
6268 /* The size vect_schedule_slp_instance computes is off for us. */
6269 vec_num = vect_get_num_vectors
6270 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6271 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6272 vectype_in);
6273 else
6274 vec_num = 1;
6276 /* Generate the reduction PHIs upfront. */
6277 prev_phi_info = NULL;
6278 for (j = 0; j < ncopies; j++)
6280 if (j == 0 || !single_defuse_cycle)
6282 for (i = 0; i < vec_num; i++)
6284 /* Create the reduction-phi that defines the reduction
6285 operand. */
6286 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6287 set_vinfo_for_stmt (new_phi,
6288 new_stmt_vec_info (new_phi, loop_vinfo));
6290 if (slp_node)
6291 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6292 else
6294 if (j == 0)
6295 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6296 else
6297 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6298 prev_phi_info = vinfo_for_stmt (new_phi);
6304 return true;
6307 /* 1. Is vectorizable reduction? */
6308 /* Not supportable if the reduction variable is used in the loop, unless
6309 it's a reduction chain. */
6310 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6311 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6312 return false;
6314 /* Reductions that are not used even in an enclosing outer-loop,
6315 are expected to be "live" (used out of the loop). */
6316 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6317 && !STMT_VINFO_LIVE_P (stmt_info))
6318 return false;
6320 /* 2. Has this been recognized as a reduction pattern?
6322 Check if STMT represents a pattern that has been recognized
6323 in earlier analysis stages. For stmts that represent a pattern,
6324 the STMT_VINFO_RELATED_STMT field records the last stmt in
6325 the original sequence that constitutes the pattern. */
6327 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6328 if (orig_stmt)
6330 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6331 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6332 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6335 /* 3. Check the operands of the operation. The first operands are defined
6336 inside the loop body. The last operand is the reduction variable,
6337 which is defined by the loop-header-phi. */
6339 gcc_assert (is_gimple_assign (stmt));
6341 /* Flatten RHS. */
6342 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6344 case GIMPLE_BINARY_RHS:
6345 code = gimple_assign_rhs_code (stmt);
6346 op_type = TREE_CODE_LENGTH (code);
6347 gcc_assert (op_type == binary_op);
6348 ops[0] = gimple_assign_rhs1 (stmt);
6349 ops[1] = gimple_assign_rhs2 (stmt);
6350 break;
6352 case GIMPLE_TERNARY_RHS:
6353 code = gimple_assign_rhs_code (stmt);
6354 op_type = TREE_CODE_LENGTH (code);
6355 gcc_assert (op_type == ternary_op);
6356 ops[0] = gimple_assign_rhs1 (stmt);
6357 ops[1] = gimple_assign_rhs2 (stmt);
6358 ops[2] = gimple_assign_rhs3 (stmt);
6359 break;
6361 case GIMPLE_UNARY_RHS:
6362 return false;
6364 default:
6365 gcc_unreachable ();
6368 if (code == COND_EXPR && slp_node)
6369 return false;
6371 scalar_dest = gimple_assign_lhs (stmt);
6372 scalar_type = TREE_TYPE (scalar_dest);
6373 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6374 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6375 return false;
6377 /* Do not try to vectorize bit-precision reductions. */
6378 if (!type_has_mode_precision_p (scalar_type))
6379 return false;
6381 /* All uses but the last are expected to be defined in the loop.
6382 The last use is the reduction variable. In case of nested cycle this
6383 assumption is not true: we use reduc_index to record the index of the
6384 reduction variable. */
6385 gimple *reduc_def_stmt = NULL;
6386 int reduc_index = -1;
6387 for (i = 0; i < op_type; i++)
6389 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6390 if (i == 0 && code == COND_EXPR)
6391 continue;
6393 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6394 &def_stmt, &dts[i], &tem);
6395 dt = dts[i];
6396 gcc_assert (is_simple_use);
6397 if (dt == vect_reduction_def)
6399 reduc_def_stmt = def_stmt;
6400 reduc_index = i;
6401 continue;
6403 else if (tem)
6405 /* To properly compute ncopies we are interested in the widest
6406 input type in case we're looking at a widening accumulation. */
6407 if (!vectype_in
6408 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6409 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6410 vectype_in = tem;
6413 if (dt != vect_internal_def
6414 && dt != vect_external_def
6415 && dt != vect_constant_def
6416 && dt != vect_induction_def
6417 && !(dt == vect_nested_cycle && nested_cycle))
6418 return false;
6420 if (dt == vect_nested_cycle)
6422 found_nested_cycle_def = true;
6423 reduc_def_stmt = def_stmt;
6424 reduc_index = i;
6427 if (i == 1 && code == COND_EXPR)
6429 /* Record how value of COND_EXPR is defined. */
6430 if (dt == vect_constant_def)
6432 cond_reduc_dt = dt;
6433 cond_reduc_val = ops[i];
6435 if (dt == vect_induction_def
6436 && def_stmt != NULL
6437 && is_nonwrapping_integer_induction (def_stmt, loop))
6439 cond_reduc_dt = dt;
6440 cond_reduc_def_stmt = def_stmt;
6445 if (!vectype_in)
6446 vectype_in = vectype_out;
6448 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6449 directly used in stmt. */
6450 if (reduc_index == -1)
6452 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6454 if (dump_enabled_p ())
6455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6456 "in-order reduction chain without SLP.\n");
6457 return false;
6460 if (orig_stmt)
6461 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6462 else
6463 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6466 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6467 return false;
6469 if (!(reduc_index == -1
6470 || dts[reduc_index] == vect_reduction_def
6471 || dts[reduc_index] == vect_nested_cycle
6472 || ((dts[reduc_index] == vect_internal_def
6473 || dts[reduc_index] == vect_external_def
6474 || dts[reduc_index] == vect_constant_def
6475 || dts[reduc_index] == vect_induction_def)
6476 && nested_cycle && found_nested_cycle_def)))
6478 /* For pattern recognized stmts, orig_stmt might be a reduction,
6479 but some helper statements for the pattern might not, or
6480 might be COND_EXPRs with reduction uses in the condition. */
6481 gcc_assert (orig_stmt);
6482 return false;
6485 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6486 enum vect_reduction_type v_reduc_type
6487 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6488 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6490 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6491 /* If we have a condition reduction, see if we can simplify it further. */
6492 if (v_reduc_type == COND_REDUCTION)
6494 /* TODO: We can't yet handle reduction chains, since we need to treat
6495 each COND_EXPR in the chain specially, not just the last one.
6496 E.g. for:
6498 x_1 = PHI <x_3, ...>
6499 x_2 = a_2 ? ... : x_1;
6500 x_3 = a_3 ? ... : x_2;
6502 we're interested in the last element in x_3 for which a_2 || a_3
6503 is true, whereas the current reduction chain handling would
6504 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6505 as a reduction operation. */
6506 if (reduc_index == -1)
6508 if (dump_enabled_p ())
6509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6510 "conditional reduction chains not supported\n");
6511 return false;
6514 /* vect_is_simple_reduction ensured that operand 2 is the
6515 loop-carried operand. */
6516 gcc_assert (reduc_index == 2);
6518 /* Loop peeling modifies the initial value of the reduction PHI, which
6519 makes the reduction stmt to be transformed differ from the
6520 original stmt analyzed. We need to record the reduction code for
6521 a CONST_COND_REDUCTION type reduction at the analysis stage, so that
6522 it can be used directly at the transform stage. */
6523 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6524 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6526 /* Also set the reduction type to CONST_COND_REDUCTION. */
6527 gcc_assert (cond_reduc_dt == vect_constant_def);
6528 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6530 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6531 vectype_in, OPTIMIZE_FOR_SPEED))
6533 if (dump_enabled_p ())
6534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6535 "optimizing condition reduction with"
6536 " FOLD_EXTRACT_LAST.\n");
6537 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6539 else if (cond_reduc_dt == vect_induction_def)
6541 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6542 tree base
6543 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6544 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6546 gcc_assert (TREE_CODE (base) == INTEGER_CST
6547 && TREE_CODE (step) == INTEGER_CST);
6548 cond_reduc_val = NULL_TREE;
6549 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6550 MIN_EXPR; for now punt if BASE is the minimum value of the type for
6551 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6552 if (tree_int_cst_sgn (step) == -1)
6554 cond_reduc_op_code = MIN_EXPR;
6555 if (tree_int_cst_sgn (base) == -1)
6556 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6557 else if (tree_int_cst_lt (base,
6558 TYPE_MAX_VALUE (TREE_TYPE (base))))
6559 cond_reduc_val
6560 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6562 else
6564 cond_reduc_op_code = MAX_EXPR;
6565 if (tree_int_cst_sgn (base) == 1)
6566 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6567 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6568 base))
6569 cond_reduc_val
6570 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6572 if (cond_reduc_val)
6574 if (dump_enabled_p ())
6575 dump_printf_loc (MSG_NOTE, vect_location,
6576 "condition expression based on "
6577 "integer induction.\n");
6578 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6579 = INTEGER_INDUC_COND_REDUCTION;
6582 else if (cond_reduc_dt == vect_constant_def)
6584 enum vect_def_type cond_initial_dt;
6585 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6586 tree cond_initial_val
6587 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6589 gcc_assert (cond_reduc_val != NULL_TREE);
6590 vect_is_simple_use (cond_initial_val, loop_vinfo,
6591 &def_stmt, &cond_initial_dt);
6592 if (cond_initial_dt == vect_constant_def
6593 && types_compatible_p (TREE_TYPE (cond_initial_val),
6594 TREE_TYPE (cond_reduc_val)))
6596 tree e = fold_binary (LE_EXPR, boolean_type_node,
6597 cond_initial_val, cond_reduc_val);
6598 if (e && (integer_onep (e) || integer_zerop (e)))
6600 if (dump_enabled_p ())
6601 dump_printf_loc (MSG_NOTE, vect_location,
6602 "condition expression based on "
6603 "compile time constant.\n");
6604 /* Record reduction code at analysis stage. */
6605 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6606 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6607 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6608 = CONST_COND_REDUCTION;
6614 if (orig_stmt)
6615 gcc_assert (tmp == orig_stmt
6616 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6617 == orig_stmt));
6618 else
6619 /* We changed STMT to be the first stmt in reduction chain, hence we
6620 check that in this case the first element in the chain is STMT. */
6621 gcc_assert (stmt == tmp
6622 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6624 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6625 return false;
6627 if (slp_node)
6628 ncopies = 1;
6629 else
6630 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6632 gcc_assert (ncopies >= 1);
6634 vec_mode = TYPE_MODE (vectype_in);
6635 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6637 if (code == COND_EXPR)
6639 /* Only call during the analysis stage, otherwise we'll lose
6640 STMT_VINFO_TYPE. */
6641 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6642 ops[reduc_index], 0, NULL,
6643 cost_vec))
6645 if (dump_enabled_p ())
6646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6647 "unsupported condition in reduction\n");
6648 return false;
6651 else
6653 /* 4. Supportable by target? */
6655 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6656 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6658 /* Shifts and rotates are only supported by vectorizable_shifts,
6659 not vectorizable_reduction. */
6660 if (dump_enabled_p ())
6661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6662 "unsupported shift or rotation.\n");
6663 return false;
6666 /* 4.1. check support for the operation in the loop */
6667 optab = optab_for_tree_code (code, vectype_in, optab_default);
6668 if (!optab)
6670 if (dump_enabled_p ())
6671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6672 "no optab.\n");
6674 return false;
6677 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6679 if (dump_enabled_p ())
6680 dump_printf (MSG_NOTE, "op not supported by target.\n");
6682 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6683 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6684 return false;
6686 if (dump_enabled_p ())
6687 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6690 /* Worthwhile without SIMD support? */
6691 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6692 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6694 if (dump_enabled_p ())
6695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6696 "not worthwhile without SIMD support.\n");
6698 return false;
6702 /* 4.2. Check support for the epilog operation.
6704 If STMT represents a reduction pattern, then the type of the
6705 reduction variable may be different than the type of the rest
6706 of the arguments. For example, consider the case of accumulation
6707 of shorts into an int accumulator. The original code:
6708 S1: int_a = (int) short_a;
6709 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6711 was replaced with:
6712 STMT: int_acc = widen_sum <short_a, int_acc>
6714 This means that:
6715 1. The tree-code that is used to create the vector operation in the
6716 epilog code (that reduces the partial results) is not the
6717 tree-code of STMT, but is rather the tree-code of the original
6718 stmt from the pattern that STMT is replacing. I.e, in the example
6719 above we want to use 'widen_sum' in the loop, but 'plus' in the
6720 epilog.
6721 2. The type (mode) we use to check available target support
6722 for the vector operation to be created in the *epilog*, is
6723 determined by the type of the reduction variable (in the example
6724 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6725 However the type (mode) we use to check available target support
6726 for the vector operation to be created *inside the loop*, is
6727 determined by the type of the other arguments to STMT (in the
6728 example we'd check this: optab_handler (widen_sum_optab,
6729 vect_short_mode)).
6731 This is contrary to "regular" reductions, in which the types of all
6732 the arguments are the same as the type of the reduction variable.
6733 For "regular" reductions we can therefore use the same vector type
6734 (and also the same tree-code) when generating the epilog code and
6735 when generating the code inside the loop. */
6737 vect_reduction_type reduction_type
6738 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6739 if (orig_stmt
6740 && (reduction_type == TREE_CODE_REDUCTION
6741 || reduction_type == FOLD_LEFT_REDUCTION))
6743 /* This is a reduction pattern: get the vectype from the type of the
6744 reduction variable, and get the tree-code from orig_stmt. */
6745 orig_code = gimple_assign_rhs_code (orig_stmt);
6746 gcc_assert (vectype_out);
6747 vec_mode = TYPE_MODE (vectype_out);
6749 else
6751 /* Regular reduction: the same vectype and tree-code as used for
6752 the vector code inside the loop can also be used for the epilog code. */
6753 orig_code = code;
6755 if (code == MINUS_EXPR)
6756 orig_code = PLUS_EXPR;
6758 /* For simple condition reductions, replace with the actual expression
6759 we want to base our reduction around. */
6760 if (reduction_type == CONST_COND_REDUCTION)
6762 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6763 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6765 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6766 orig_code = cond_reduc_op_code;
6769 if (nested_cycle)
6771 def_bb = gimple_bb (reduc_def_stmt);
6772 def_stmt_loop = def_bb->loop_father;
6773 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6774 loop_preheader_edge (def_stmt_loop));
6775 if (TREE_CODE (def_arg) == SSA_NAME
6776 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6777 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6778 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6779 && vinfo_for_stmt (def_arg_stmt)
6780 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6781 == vect_double_reduction_def)
6782 double_reduc = true;
6785 reduc_fn = IFN_LAST;
6787 if (reduction_type == TREE_CODE_REDUCTION
6788 || reduction_type == FOLD_LEFT_REDUCTION
6789 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6790 || reduction_type == CONST_COND_REDUCTION)
6792 if (reduction_type == FOLD_LEFT_REDUCTION
6793 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6794 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6796 if (reduc_fn != IFN_LAST
6797 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6798 OPTIMIZE_FOR_SPEED))
6800 if (dump_enabled_p ())
6801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6802 "reduc op not supported by target.\n");
6804 reduc_fn = IFN_LAST;
6807 else
6809 if (!nested_cycle || double_reduc)
6811 if (dump_enabled_p ())
6812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6813 "no reduc code for scalar code.\n");
6815 return false;
6819 else if (reduction_type == COND_REDUCTION)
6821 int scalar_precision
6822 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6823 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6824 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6825 nunits_out);
6827 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6828 OPTIMIZE_FOR_SPEED))
6829 reduc_fn = IFN_REDUC_MAX;
6832 if (reduction_type != EXTRACT_LAST_REDUCTION
6833 && reduc_fn == IFN_LAST
6834 && !nunits_out.is_constant ())
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838 "missing target support for reduction on"
6839 " variable-length vectors.\n");
6840 return false;
6843 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6844 && ncopies > 1)
6846 if (dump_enabled_p ())
6847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6848 "multiple types in double reduction or condition "
6849 "reduction.\n");
6850 return false;
6853 /* For SLP reductions, see if there is a neutral value we can use. */
6854 tree neutral_op = NULL_TREE;
6855 if (slp_node)
6856 neutral_op = neutral_op_for_slp_reduction
6857 (slp_node_instance->reduc_phis, code,
6858 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6860 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6862 /* We can't support in-order reductions of code such as this:
6864 for (int i = 0; i < n1; ++i)
6865 for (int j = 0; j < n2; ++j)
6866 l += a[j];
6868 since GCC effectively transforms the loop when vectorizing:
6870 for (int i = 0; i < n1 / VF; ++i)
6871 for (int j = 0; j < n2; ++j)
6872 for (int k = 0; k < VF; ++k)
6873 l += a[j];
6875 which is a reassociation of the original operation. */
6876 if (dump_enabled_p ())
6877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6878 "in-order double reduction not supported.\n");
6880 return false;
6883 if (reduction_type == FOLD_LEFT_REDUCTION
6884 && slp_node
6885 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6887 /* We cannot use in-order reductions in this case because there is
6888 an implicit reassociation of the operations involved. */
6889 if (dump_enabled_p ())
6890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6891 "in-order unchained SLP reductions not supported.\n");
6892 return false;
6895 /* For double reductions, and for SLP reductions with a neutral value,
6896 we construct a variable-length initial vector by loading a vector
6897 full of the neutral value and then shift-and-inserting the start
6898 values into the low-numbered elements. */
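/* E.g. (a sketch) for a PLUS reduction with start value S and neutral
   value 0, the initial vector becomes {S, 0, 0, ...}: a splat of the
   neutral value with S shift-and-inserted into element 0. */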
6899 if ((double_reduc || neutral_op)
6900 && !nunits_out.is_constant ()
6901 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6902 vectype_out, OPTIMIZE_FOR_SPEED))
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "reduction on variable-length vectors requires"
6907 " target support for a vector-shift-and-insert"
6908 " operation.\n");
6909 return false;
6912 /* Check extra constraints for variable-length unchained SLP reductions. */
6913 if (STMT_SLP_TYPE (stmt_info)
6914 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6915 && !nunits_out.is_constant ())
6917 /* We checked above that we could build the initial vector when
6918 there's a neutral element value. Check here for the case in
6919 which each SLP statement has its own initial value and in which
6920 that value needs to be repeated for every instance of the
6921 statement within the initial vector. */
6922 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6923 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6924 if (!neutral_op
6925 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6927 if (dump_enabled_p ())
6928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6929 "unsupported form of SLP reduction for"
6930 " variable-length vectors: cannot build"
6931 " initial vector.\n");
6932 return false;
6934 /* The epilogue code relies on the number of elements being a multiple
6935 of the group size. The duplicate-and-interleave approach to setting
6936 up the initial vector does too. */
6937 if (!multiple_p (nunits_out, group_size))
6939 if (dump_enabled_p ())
6940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6941 "unsupported form of SLP reduction for"
6942 " variable-length vectors: the vector size"
6943 " is not a multiple of the number of results.\n");
6944 return false;
6948 /* In case of a widening multiplication by a constant, we update the type
6949 of the constant to be the type of the other operand. We check that the
6950 constant fits the type in the pattern recognition pass. */
6951 if (code == DOT_PROD_EXPR
6952 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6954 if (TREE_CODE (ops[0]) == INTEGER_CST)
6955 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6956 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6957 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6958 else
6960 if (dump_enabled_p ())
6961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6962 "invalid types in dot-prod\n");
6964 return false;
6968 if (reduction_type == COND_REDUCTION)
6970 widest_int ni;
6972 if (! max_loop_iterations (loop, &ni))
6974 if (dump_enabled_p ())
6975 dump_printf_loc (MSG_NOTE, vect_location,
6976 "loop count not known, cannot create cond "
6977 "reduction.\n");
6978 return false;
6980 /* Convert backedges to iterations. */
6981 ni += 1;
6983 /* The additional index will be the same type as the condition. Check
6984 that the loop iteration count fits into this type less one (because
6985 we'll use up the zero slot for when there are no matches). */
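/* Worked example (illustrative): if the condition - and hence the index -
   were an unsigned char type, max_index would be 255; index 0 is reserved
   for "no match", so only loops with at most 254 iterations (ni < 255)
   pass the check below.  */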
6986 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6987 if (wi::geu_p (ni, wi::to_widest (max_index)))
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_NOTE, vect_location,
6991 "loop size is greater than data size.\n");
6992 return false;
6996 /* If the vectorization factor (VF) is bigger than the number
6997 of elements that we can fit in a vectype (nunits), we have to generate
6998 more than one vector stmt - i.e., we need to "unroll" the
6999 vector stmt by a factor of VF/nunits. For more details see the
7000 documentation in vectorizable_operation. */
7002 /* If the reduction is used in an outer loop we need to generate
7003 VF intermediate results, like so (e.g. for ncopies=2):
7004 r0 = phi (init, r0)
7005 r1 = phi (init, r1)
7006 r0 = x0 + r0;
7007 r1 = x1 + r1;
7008 (i.e. we generate VF results in 2 registers).
7009 In this case we have a separate def-use cycle for each copy, and therefore
7010 for each copy we get the vector def for the reduction variable from the
7011 respective phi node created for this copy.
7013 Otherwise (the reduction is unused in the loop nest), we can combine
7014 together intermediate results, like so (e.g. for ncopies=2):
7015 r = phi (init, r)
7016 r = x0 + r;
7017 r = x1 + r;
7018 (i.e. we generate VF/2 results in a single register).
7019 In this case for each copy we get the vector def for the reduction variable
7020 from the vectorized reduction operation generated in the previous iteration.
7022 This only works when we see both the reduction PHI and its only consumer
7023 in vectorizable_reduction and there are no intermediate stmts
7024 participating. */
7025 use_operand_p use_p;
7026 gimple *use_stmt;
7027 if (ncopies > 1
7028 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7029 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7030 && (use_stmt == stmt
7031 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7033 single_defuse_cycle = true;
7034 epilog_copies = 1;
7036 else
7037 epilog_copies = ncopies;
7039 /* If the reduction stmt is one of the patterns that have a lane-reducing
7040 operation embedded, we cannot handle the case of !single_defuse_cycle. */
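/* For reference (an illustrative sketch of what "lane-reducing" means
   here): a statement like sum += (int) a[i] * (int) b[i] with short a, b
   and an int sum is recognized as DOT_PROD_EXPR, which folds several
   narrow products into a single wider accumulator lane per statement.  */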
7041 if ((ncopies > 1
7042 && ! single_defuse_cycle)
7043 && (code == DOT_PROD_EXPR
7044 || code == WIDEN_SUM_EXPR
7045 || code == SAD_EXPR))
7047 if (dump_enabled_p ())
7048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7049 "multi def-use cycle not possible for lane-reducing "
7050 "reduction operation\n");
7051 return false;
7054 if (slp_node)
7055 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7056 else
7057 vec_num = 1;
7059 internal_fn cond_fn = get_conditional_internal_fn (code);
7060 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7062 if (!vec_stmt) /* transformation not required. */
7064 if (first_p)
7065 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7066 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7068 if (reduction_type != FOLD_LEFT_REDUCTION
7069 && (cond_fn == IFN_LAST
7070 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7071 OPTIMIZE_FOR_SPEED)))
7073 if (dump_enabled_p ())
7074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7075 "can't use a fully-masked loop because no"
7076 " conditional operation is available.\n");
7077 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7079 else if (reduc_index == -1)
7081 if (dump_enabled_p ())
7082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7083 "can't use a fully-masked loop for chained"
7084 " reductions.\n");
7085 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7087 else
7088 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7089 vectype_in);
7091 if (dump_enabled_p ()
7092 && reduction_type == FOLD_LEFT_REDUCTION)
7093 dump_printf_loc (MSG_NOTE, vect_location,
7094 "using an in-order (fold-left) reduction.\n");
7095 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7096 return true;
7099 /* Transform. */
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7104 /* FORNOW: Multiple types are not supported for condition. */
7105 if (code == COND_EXPR)
7106 gcc_assert (ncopies == 1);
7108 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7110 if (reduction_type == FOLD_LEFT_REDUCTION)
7111 return vectorize_fold_left_reduction
7112 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7113 reduc_fn, ops, vectype_in, reduc_index, masks);
7115 if (reduction_type == EXTRACT_LAST_REDUCTION)
7117 gcc_assert (!slp_node);
7118 return vectorizable_condition (stmt, gsi, vec_stmt,
7119 NULL, reduc_index, NULL, NULL);
7122 /* Create the destination vector */
7123 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7125 prev_stmt_info = NULL;
7126 prev_phi_info = NULL;
7127 if (!slp_node)
7129 vec_oprnds0.create (1);
7130 vec_oprnds1.create (1);
7131 if (op_type == ternary_op)
7132 vec_oprnds2.create (1);
7135 phis.create (vec_num);
7136 vect_defs.create (vec_num);
7137 if (!slp_node)
7138 vect_defs.quick_push (NULL_TREE);
7140 if (slp_node)
7141 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7142 else
7143 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7145 for (j = 0; j < ncopies; j++)
7147 if (code == COND_EXPR)
7149 gcc_assert (!slp_node);
7150 vectorizable_condition (stmt, gsi, vec_stmt,
7151 PHI_RESULT (phis[0]),
7152 reduc_index, NULL, NULL);
7153 /* Multiple types are not supported for condition. */
7154 break;
7157 /* Handle uses. */
7158 if (j == 0)
7160 if (slp_node)
7162 /* Get vec defs for all the operands except the reduction index,
7163 ensuring the ordering of the ops in the vector is kept. */
7164 auto_vec<tree, 3> slp_ops;
7165 auto_vec<vec<tree>, 3> vec_defs;
7167 slp_ops.quick_push (ops[0]);
7168 slp_ops.quick_push (ops[1]);
7169 if (op_type == ternary_op)
7170 slp_ops.quick_push (ops[2]);
7172 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7174 vec_oprnds0.safe_splice (vec_defs[0]);
7175 vec_defs[0].release ();
7176 vec_oprnds1.safe_splice (vec_defs[1]);
7177 vec_defs[1].release ();
7178 if (op_type == ternary_op)
7180 vec_oprnds2.safe_splice (vec_defs[2]);
7181 vec_defs[2].release ();
7184 else
7186 vec_oprnds0.quick_push
7187 (vect_get_vec_def_for_operand (ops[0], stmt));
7188 vec_oprnds1.quick_push
7189 (vect_get_vec_def_for_operand (ops[1], stmt));
7190 if (op_type == ternary_op)
7191 vec_oprnds2.quick_push
7192 (vect_get_vec_def_for_operand (ops[2], stmt));
7195 else
7197 if (!slp_node)
7199 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7201 if (single_defuse_cycle && reduc_index == 0)
7202 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7203 else
7204 vec_oprnds0[0]
7205 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7206 if (single_defuse_cycle && reduc_index == 1)
7207 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7208 else
7209 vec_oprnds1[0]
7210 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7211 if (op_type == ternary_op)
7213 if (single_defuse_cycle && reduc_index == 2)
7214 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7215 else
7216 vec_oprnds2[0]
7217 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7222 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7224 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7225 if (masked_loop_p)
7227 /* Make sure that the reduction accumulator is vop[0]. */
7228 if (reduc_index == 1)
7230 gcc_assert (commutative_tree_code (code));
7231 std::swap (vop[0], vop[1]);
7233 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7234 vectype_in, i * ncopies + j);
7235 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7236 vop[0], vop[1],
7237 vop[0]);
7238 new_temp = make_ssa_name (vec_dest, call);
7239 gimple_call_set_lhs (call, new_temp);
7240 gimple_call_set_nothrow (call, true);
7241 new_stmt = call;
7243 else
7245 if (op_type == ternary_op)
7246 vop[2] = vec_oprnds2[i];
7248 new_temp = make_ssa_name (vec_dest, new_stmt);
7249 new_stmt = gimple_build_assign (new_temp, code,
7250 vop[0], vop[1], vop[2]);
7252 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7254 if (slp_node)
7256 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7257 vect_defs.quick_push (new_temp);
7259 else
7260 vect_defs[0] = new_temp;
7263 if (slp_node)
7264 continue;
7266 if (j == 0)
7267 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7268 else
7269 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7271 prev_stmt_info = vinfo_for_stmt (new_stmt);
7274 /* Finalize the reduction-phi (set its arguments) and create the
7275 epilog reduction code. */
7276 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7277 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7279 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7280 epilog_copies, reduc_fn, phis,
7281 double_reduc, slp_node, slp_node_instance,
7282 cond_reduc_val, cond_reduc_op_code,
7283 neutral_op);
7285 return true;
7288 /* Function vect_min_worthwhile_factor.
7290 For a loop where we could vectorize the operation indicated by CODE,
7291 return the minimum vectorization factor that makes it worthwhile
7292 to use generic vectors. */
7293 static unsigned int
7294 vect_min_worthwhile_factor (enum tree_code code)
7296 switch (code)
7298 case PLUS_EXPR:
7299 case MINUS_EXPR:
7300 case NEGATE_EXPR:
7301 return 4;
7303 case BIT_AND_EXPR:
7304 case BIT_IOR_EXPR:
7305 case BIT_XOR_EXPR:
7306 case BIT_NOT_EXPR:
7307 return 2;
7309 default:
7310 return INT_MAX;
7314 /* Return true if VINFO indicates we are doing loop vectorization and if
7315 it is worth decomposing CODE operations into scalar operations for
7316 that loop's vectorization factor. */
7318 bool
7319 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7321 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7322 unsigned HOST_WIDE_INT value;
7323 return (loop_vinfo
7324 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7325 && value >= vect_min_worthwhile_factor (code));
7328 /* Function vectorizable_induction
7330 Check if PHI performs an induction computation that can be vectorized.
7331 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7332 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7333 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7335 bool
7336 vectorizable_induction (gimple *phi,
7337 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7338 gimple **vec_stmt, slp_tree slp_node,
7339 stmt_vector_for_cost *cost_vec)
7341 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7342 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7343 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7344 unsigned ncopies;
7345 bool nested_in_vect_loop = false;
7346 struct loop *iv_loop;
7347 tree vec_def;
7348 edge pe = loop_preheader_edge (loop);
7349 basic_block new_bb;
7350 tree new_vec, vec_init, vec_step, t;
7351 tree new_name;
7352 gimple *new_stmt;
7353 gphi *induction_phi;
7354 tree induc_def, vec_dest;
7355 tree init_expr, step_expr;
7356 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7357 unsigned i;
7358 tree expr;
7359 gimple_seq stmts;
7360 imm_use_iterator imm_iter;
7361 use_operand_p use_p;
7362 gimple *exit_phi;
7363 edge latch_e;
7364 tree loop_arg;
7365 gimple_stmt_iterator si;
7366 basic_block bb = gimple_bb (phi);
7368 if (gimple_code (phi) != GIMPLE_PHI)
7369 return false;
7371 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7372 return false;
7374 /* Make sure it was recognized as induction computation. */
7375 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7376 return false;
7378 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7379 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7381 if (slp_node)
7382 ncopies = 1;
7383 else
7384 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7385 gcc_assert (ncopies >= 1);
7387 /* FORNOW. These restrictions should be relaxed. */
7388 if (nested_in_vect_loop_p (loop, phi))
7390 imm_use_iterator imm_iter;
7391 use_operand_p use_p;
7392 gimple *exit_phi;
7393 edge latch_e;
7394 tree loop_arg;
7396 if (ncopies > 1)
7398 if (dump_enabled_p ())
7399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7400 "multiple types in nested loop.\n");
7401 return false;
7404 /* FORNOW: outer loop induction with SLP not supported. */
7405 if (STMT_SLP_TYPE (stmt_info))
7406 return false;
7408 exit_phi = NULL;
7409 latch_e = loop_latch_edge (loop->inner);
7410 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7411 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7413 gimple *use_stmt = USE_STMT (use_p);
7414 if (is_gimple_debug (use_stmt))
7415 continue;
7417 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7419 exit_phi = use_stmt;
7420 break;
7423 if (exit_phi)
7425 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7426 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7427 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7429 if (dump_enabled_p ())
7430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7431 "inner-loop induction only used outside "
7432 "of the outer vectorized loop.\n");
7433 return false;
7437 nested_in_vect_loop = true;
7438 iv_loop = loop->inner;
7440 else
7441 iv_loop = loop;
7442 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7444 if (slp_node && !nunits.is_constant ())
7446 /* The current SLP code creates the initial value element-by-element. */
7447 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7449 "SLP induction not supported for variable-length"
7450 " vectors.\n");
7451 return false;
7454 if (!vec_stmt) /* transformation not required. */
7456 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7457 DUMP_VECT_SCOPE ("vectorizable_induction");
7458 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7459 return true;
7462 /* Transform. */
7464 /* Compute a vector variable, initialized with the first VF values of
7465 the induction variable. E.g., for an iv with IV_PHI='X' and
7466 evolution S, for a vector of 4 units, we want to compute:
7467 [X, X + S, X + 2*S, X + 3*S]. */
7469 if (dump_enabled_p ())
7470 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7472 latch_e = loop_latch_edge (iv_loop);
7473 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7475 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7476 gcc_assert (step_expr != NULL_TREE);
7478 pe = loop_preheader_edge (iv_loop);
7479 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7480 loop_preheader_edge (iv_loop));
7482 stmts = NULL;
7483 if (!nested_in_vect_loop)
7485 /* Convert the initial value to the desired type. */
7486 tree new_type = TREE_TYPE (vectype);
7487 init_expr = gimple_convert (&stmts, new_type, init_expr);
7489 /* If we are using the loop mask to "peel" for alignment then we need
7490 to adjust the start value here. */
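/* Worked example (illustrative): if the mask skips the first 3 scalar
   iterations and the IV has start X and step S, the code below computes
   X - 3*S, so that lane 3 of the initial vector
   [X - 3*S, X - 2*S, X - S, X, ...] - the first unmasked lane - still
   sees the original start value X.  */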
7491 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7492 if (skip_niters != NULL_TREE)
7494 if (FLOAT_TYPE_P (vectype))
7495 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7496 skip_niters);
7497 else
7498 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7499 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7500 skip_niters, step_expr);
7501 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7502 init_expr, skip_step);
7506 /* Convert the step to the desired type. */
7507 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7509 if (stmts)
7511 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7512 gcc_assert (!new_bb);
7515 /* Find the first insertion point in the BB. */
7516 si = gsi_after_labels (bb);
7518 /* For SLP induction we have to generate several IVs as for example
7519 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7520 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7521 [VF*S, VF*S, VF*S, VF*S] for all. */
7522 if (slp_node)
7524 /* Enforced above. */
7525 unsigned int const_nunits = nunits.to_constant ();
7527 /* Generate [VF*S, VF*S, ... ]. */
7528 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7530 expr = build_int_cst (integer_type_node, vf);
7531 expr = fold_convert (TREE_TYPE (step_expr), expr);
7533 else
7534 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7535 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7536 expr, step_expr);
7537 if (! CONSTANT_CLASS_P (new_name))
7538 new_name = vect_init_vector (phi, new_name,
7539 TREE_TYPE (step_expr), NULL);
7540 new_vec = build_vector_from_val (vectype, new_name);
7541 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7543 /* Now generate the IVs. */
7544 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7545 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7546 unsigned elts = const_nunits * nvects;
7547 unsigned nivs = least_common_multiple (group_size,
7548 const_nunits) / const_nunits;
7549 gcc_assert (elts % group_size == 0);
7550 tree elt = init_expr;
7551 unsigned ivn;
7552 for (ivn = 0; ivn < nivs; ++ivn)
7554 tree_vector_builder elts (vectype, const_nunits, 1);
7555 stmts = NULL;
7556 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7558 if (ivn*const_nunits + eltn >= group_size
7559 && (ivn * const_nunits + eltn) % group_size == 0)
7560 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7561 elt, step_expr);
7562 elts.quick_push (elt);
7564 vec_init = gimple_build_vector (&stmts, &elts);
7565 if (stmts)
7567 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7568 gcc_assert (!new_bb);
7571 /* Create the induction-phi that defines the induction-operand. */
7572 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7573 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7574 set_vinfo_for_stmt (induction_phi,
7575 new_stmt_vec_info (induction_phi, loop_vinfo));
7576 induc_def = PHI_RESULT (induction_phi);
7578 /* Create the iv update inside the loop */
7579 vec_def = make_ssa_name (vec_dest);
7580 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7581 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7582 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7584 /* Set the arguments of the phi node: */
7585 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7586 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7587 UNKNOWN_LOCATION);
7589 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7592 /* Re-use IVs when we can. */
7593 if (ivn < nvects)
7595 unsigned vfp
7596 = least_common_multiple (group_size, const_nunits) / group_size;
7597 /* Generate [VF'*S, VF'*S, ... ]. */
7598 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7600 expr = build_int_cst (integer_type_node, vfp);
7601 expr = fold_convert (TREE_TYPE (step_expr), expr);
7603 else
7604 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7605 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7606 expr, step_expr);
7607 if (! CONSTANT_CLASS_P (new_name))
7608 new_name = vect_init_vector (phi, new_name,
7609 TREE_TYPE (step_expr), NULL);
7610 new_vec = build_vector_from_val (vectype, new_name);
7611 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7612 for (; ivn < nvects; ++ivn)
7614 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7615 tree def;
7616 if (gimple_code (iv) == GIMPLE_PHI)
7617 def = gimple_phi_result (iv);
7618 else
7619 def = gimple_assign_lhs (iv);
7620 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7621 PLUS_EXPR,
7622 def, vec_step);
7623 if (gimple_code (iv) == GIMPLE_PHI)
7624 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7625 else
7627 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7628 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7630 set_vinfo_for_stmt (new_stmt,
7631 new_stmt_vec_info (new_stmt, loop_vinfo));
7632 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7636 return true;
7639 /* Create the vector that holds the initial_value of the induction. */
7640 if (nested_in_vect_loop)
7642 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7643 been created during vectorization of previous stmts. We obtain it
7644 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7645 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7646 /* If the initial value is not of proper type, convert it. */
7647 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7649 new_stmt
7650 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7651 vect_simple_var,
7652 "vec_iv_"),
7653 VIEW_CONVERT_EXPR,
7654 build1 (VIEW_CONVERT_EXPR, vectype,
7655 vec_init));
7656 vec_init = gimple_assign_lhs (new_stmt);
7657 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7658 new_stmt);
7659 gcc_assert (!new_bb);
7660 set_vinfo_for_stmt (new_stmt,
7661 new_stmt_vec_info (new_stmt, loop_vinfo));
7664 else
7666 /* iv_loop is the loop to be vectorized. Create:
7667 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7668 stmts = NULL;
7669 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7671 unsigned HOST_WIDE_INT const_nunits;
7672 if (nunits.is_constant (&const_nunits))
7674 tree_vector_builder elts (vectype, const_nunits, 1);
7675 elts.quick_push (new_name);
7676 for (i = 1; i < const_nunits; i++)
7678 /* Create: new_name_i = new_name + step_expr */
7679 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7680 new_name, step_expr);
7681 elts.quick_push (new_name);
7683 /* Create a vector from [new_name_0, new_name_1, ...,
7684 new_name_nunits-1] */
7685 vec_init = gimple_build_vector (&stmts, &elts);
7687 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7688 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7689 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7690 new_name, step_expr);
7691 else
7693 /* Build:
7694 [base, base, base, ...]
7695 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7696 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7697 gcc_assert (flag_associative_math);
7698 tree index = build_index_vector (vectype, 0, 1);
7699 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7700 new_name);
7701 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7702 step_expr);
7703 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7704 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7705 vec_init, step_vec);
7706 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7707 vec_init, base_vec);
7710 if (stmts)
7712 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7713 gcc_assert (!new_bb);
7718 /* Create the vector that holds the step of the induction. */
7719 if (nested_in_vect_loop)
7720 /* iv_loop is nested in the loop to be vectorized. Generate:
7721 vec_step = [S, S, S, S] */
7722 new_name = step_expr;
7723 else
7725 /* iv_loop is the loop to be vectorized. Generate:
7726 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7727 gimple_seq seq = NULL;
7728 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7730 expr = build_int_cst (integer_type_node, vf);
7731 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7733 else
7734 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7735 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7736 expr, step_expr);
7737 if (seq)
7739 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7740 gcc_assert (!new_bb);
7744 t = unshare_expr (new_name);
7745 gcc_assert (CONSTANT_CLASS_P (new_name)
7746 || TREE_CODE (new_name) == SSA_NAME);
7747 new_vec = build_vector_from_val (vectype, t);
7748 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7751 /* Create the following def-use cycle:
7752 loop prolog:
7753 vec_init = ...
7754 vec_step = ...
7755 loop:
7756 vec_iv = PHI <vec_init, vec_loop>
7758 STMT
7760 vec_loop = vec_iv + vec_step; */
7762 /* Create the induction-phi that defines the induction-operand. */
7763 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7764 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7765 set_vinfo_for_stmt (induction_phi,
7766 new_stmt_vec_info (induction_phi, loop_vinfo));
7767 induc_def = PHI_RESULT (induction_phi);
7769 /* Create the iv update inside the loop */
7770 vec_def = make_ssa_name (vec_dest);
7771 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7772 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7773 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7775 /* Set the arguments of the phi node: */
7776 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7777 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7778 UNKNOWN_LOCATION);
7780 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7782 /* If the vectorization factor (VF) is bigger than the number
7783 of elements that we can fit in a vectype (nunits), we have to generate
7784 more than one vector stmt - i.e., we need to "unroll" the
7785 vector stmt by a factor of VF/nunits. For more details see the
7786 documentation in vectorizable_operation. */
7788 if (ncopies > 1)
7790 gimple_seq seq = NULL;
7791 stmt_vec_info prev_stmt_vinfo;
7792 /* FORNOW. This restriction should be relaxed. */
7793 gcc_assert (!nested_in_vect_loop);
7795 /* Create the vector that holds the step of the induction. */
7796 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7798 expr = build_int_cst (integer_type_node, nunits);
7799 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7801 else
7802 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7803 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7804 expr, step_expr);
7805 if (seq)
7807 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7808 gcc_assert (!new_bb);
7811 t = unshare_expr (new_name);
7812 gcc_assert (CONSTANT_CLASS_P (new_name)
7813 || TREE_CODE (new_name) == SSA_NAME);
7814 new_vec = build_vector_from_val (vectype, t);
7815 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7817 vec_def = induc_def;
7818 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7819 for (i = 1; i < ncopies; i++)
7821 /* vec_i = vec_prev + vec_step */
7822 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7823 vec_def, vec_step);
7824 vec_def = make_ssa_name (vec_dest, new_stmt);
7825 gimple_assign_set_lhs (new_stmt, vec_def);
7827 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7828 set_vinfo_for_stmt (new_stmt,
7829 new_stmt_vec_info (new_stmt, loop_vinfo));
7830 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7831 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7835 if (nested_in_vect_loop)
7837 /* Find the loop-closed exit-phi of the induction, and record
7838 the final vector of induction results: */
7839 exit_phi = NULL;
7840 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7842 gimple *use_stmt = USE_STMT (use_p);
7843 if (is_gimple_debug (use_stmt))
7844 continue;
7846 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7848 exit_phi = use_stmt;
7849 break;
7852 if (exit_phi)
7854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7855 /* FORNOW. Currently we do not support the case in which an inner-loop
7856 induction is used only outside the outer loop (i.e. not in the outer loop). */
7857 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7858 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7860 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7861 if (dump_enabled_p ())
7863 dump_printf_loc (MSG_NOTE, vect_location,
7864 "vector of inductions after inner-loop:");
7865 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7871 if (dump_enabled_p ())
7873 dump_printf_loc (MSG_NOTE, vect_location,
7874 "transform induction: created def-use cycle: ");
7875 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7876 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7877 SSA_NAME_DEF_STMT (vec_def), 0);
7880 return true;
7883 /* Function vectorizable_live_operation.
7885 STMT computes a value that is used outside the loop. Check if
7886 it can be supported. */
7888 bool
7889 vectorizable_live_operation (gimple *stmt,
7890 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7891 slp_tree slp_node, int slp_index,
7892 gimple **vec_stmt,
7893 stmt_vector_for_cost *)
7895 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7896 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7897 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7898 imm_use_iterator imm_iter;
7899 tree lhs, lhs_type, bitsize, vec_bitsize;
7900 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7901 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7902 int ncopies;
7903 gimple *use_stmt;
7904 auto_vec<tree> vec_oprnds;
7905 int vec_entry = 0;
7906 poly_uint64 vec_index = 0;
7908 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7910 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7911 return false;
7913 /* FORNOW. CHECKME. */
7914 if (nested_in_vect_loop_p (loop, stmt))
7915 return false;
7917 /* If STMT is not relevant and it is a simple assignment and its inputs are
7918 invariant then it can remain in place, unvectorized. The original last
7919 scalar value that it computes will be used. */
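/* For example (illustrative): an assignment such as t = x + y whose
   operands are loop-invariant and whose result is only read after the
   loop computes the same value on every iteration, so the scalar
   statement can simply stay where it is.  */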
7920 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7922 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7923 if (dump_enabled_p ())
7924 dump_printf_loc (MSG_NOTE, vect_location,
7925 "statement is simple and uses invariant. Leaving in "
7926 "place.\n");
7927 return true;
7930 if (slp_node)
7931 ncopies = 1;
7932 else
7933 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7935 if (slp_node)
7937 gcc_assert (slp_index >= 0);
7939 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7940 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7942 /* Get the last occurrence of the scalar index from the concatenation of
7943 all the slp vectors. Calculate which slp vector it is and the index
7944 within. */
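/* Worked example (illustrative numbers): with num_vec == 2 vectors of
   4 lanes and num_scalar == 2 SLP statements, the concatenation has
   8 lanes; for slp_index == 1 the last occurrence is at position
   2*4 - 2 + 1 == 7, i.e. lane 3 of the second vector.  */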
7945 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7947 /* Calculate which vector contains the result, and which lane of
7948 that vector we need. */
7949 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7951 if (dump_enabled_p ())
7952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7953 "Cannot determine which vector holds the"
7954 " final result.\n");
7955 return false;
7959 if (!vec_stmt)
7961 /* No transformation required. */
7962 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7964 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7965 OPTIMIZE_FOR_SPEED))
7967 if (dump_enabled_p ())
7968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7969 "can't use a fully-masked loop because "
7970 "the target doesn't support extract last "
7971 "reduction.\n");
7972 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7974 else if (slp_node)
7976 if (dump_enabled_p ())
7977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 "can't use a fully-masked loop because an "
7979 "SLP statement is live after the loop.\n");
7980 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7982 else if (ncopies > 1)
7984 if (dump_enabled_p ())
7985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7986 "can't use a fully-masked loop because"
7987 " ncopies is greater than 1.\n");
7988 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7990 else
7992 gcc_assert (ncopies == 1 && !slp_node);
7993 vect_record_loop_mask (loop_vinfo,
7994 &LOOP_VINFO_MASKS (loop_vinfo),
7995 1, vectype);
7998 return true;
8001 /* If stmt has a related stmt, then use that for getting the lhs. */
8002 if (is_pattern_stmt_p (stmt_info))
8003 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8005 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8006 : gimple_get_lhs (stmt);
8007 lhs_type = TREE_TYPE (lhs);
8009 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8010 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8011 : TYPE_SIZE (TREE_TYPE (vectype)));
8012 vec_bitsize = TYPE_SIZE (vectype);
8014 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8015 tree vec_lhs, bitstart;
8016 if (slp_node)
8018 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8020 /* Get the correct slp vectorized stmt. */
8021 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8022 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8023 vec_lhs = gimple_phi_result (phi);
8024 else
8025 vec_lhs = gimple_get_lhs (vec_stmt);
8027 /* Get entry to use. */
8028 bitstart = bitsize_int (vec_index);
8029 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8031 else
8033 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8034 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8035 gcc_checking_assert (ncopies == 1
8036 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8038 /* For multiple copies, get the last copy. */
8039 for (int i = 1; i < ncopies; ++i)
8040 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8041 vec_lhs);
8043 /* Get the last lane in the vector. */
8044 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8047 gimple_seq stmts = NULL;
8048 tree new_tree;
8049 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8051 /* Emit:
8053 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8055 where VEC_LHS is the vectorized live-out result and MASK is
8056 the loop mask for the final iteration. */
8057 gcc_assert (ncopies == 1 && !slp_node);
8058 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8059 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8060 1, vectype, 0);
8061 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8062 scalar_type, mask, vec_lhs);
8064 /* Convert the extracted vector element to the required scalar type. */
8065 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8067 else
8069 tree bftype = TREE_TYPE (vectype);
8070 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8071 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8072 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8073 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8074 &stmts, true, NULL_TREE);
8077 if (stmts)
8078 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8080 /* Replace uses of LHS with the newly computed result. If the use stmt is a
8081 single-argument PHI, just replace all uses of the PHI result. This is
8082 necessary because the LCSSA PHI defining LHS may appear before the newly inserted stmt. */
8083 use_operand_p use_p;
8084 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8085 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8086 && !is_gimple_debug (use_stmt))
8088 if (gimple_code (use_stmt) == GIMPLE_PHI
8089 && gimple_phi_num_args (use_stmt) == 1)
8091 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8093 else
8095 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8096 SET_USE (use_p, new_tree);
8098 update_stmt (use_stmt);
8101 return true;
8104 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8106 static void
8107 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8109 ssa_op_iter op_iter;
8110 imm_use_iterator imm_iter;
8111 def_operand_p def_p;
8112 gimple *ustmt;
8114 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8116 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8118 basic_block bb;
8120 if (!is_gimple_debug (ustmt))
8121 continue;
8123 bb = gimple_bb (ustmt);
8125 if (!flow_bb_inside_loop_p (loop, bb))
8127 if (gimple_debug_bind_p (ustmt))
8129 if (dump_enabled_p ())
8130 dump_printf_loc (MSG_NOTE, vect_location,
8131 "killing debug use\n");
8133 gimple_debug_bind_reset_value (ustmt);
8134 update_stmt (ustmt);
8136 else
8137 gcc_unreachable ();
8143 /* Given loop represented by LOOP_VINFO, return true if computation of
8144 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8145 otherwise. */
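/* For example (illustrative): if NITERSM1 has a 32-bit unsigned type and
   may equal 0xffffffff, then NITERSM1 + 1 wraps to zero, so the function
   must return false; if the latch count is known to be below the type's
   maximum value, the increment cannot overflow.  */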
8147 static bool
8148 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8150 /* Constant case. */
8151 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8153 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8154 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8156 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8157 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8158 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8159 return true;
8162 widest_int max;
8163 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8164 /* Check the upper bound of loop niters. */
8165 if (get_max_loop_iterations (loop, &max))
8167 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8168 signop sgn = TYPE_SIGN (type);
8169 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8170 if (max < type_max)
8171 return true;
8173 return false;
8176 /* Return a mask type with half the number of elements as TYPE. */
8178 tree
8179 vect_halve_mask_nunits (tree type)
8181 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8182 return build_truth_vector_type (nunits, current_vector_size);
8185 /* Return a mask type with twice as many elements as TYPE. */
8187 tree
8188 vect_double_mask_nunits (tree type)
8190 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8191 return build_truth_vector_type (nunits, current_vector_size);
8194 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8195 contain a sequence of NVECTORS masks that each control a vector of type
8196 VECTYPE. */
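/* Illustrative numbers: with a vectorization factor of 8 and NVECTORS == 2
   masks each controlling an 8-element vector, the rgroup lives in
   (*masks)[1] and handles 2 * 8 / 8 == 2 scalar values per scalar
   iteration.  */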
8198 void
8199 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8200 unsigned int nvectors, tree vectype)
8202 gcc_assert (nvectors != 0);
8203 if (masks->length () < nvectors)
8204 masks->safe_grow_cleared (nvectors);
8205 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8206 /* The number of scalars per iteration and the number of vectors are
8207 both compile-time constants. */
8208 unsigned int nscalars_per_iter
8209 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8210 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8211 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8213 rgm->max_nscalars_per_iter = nscalars_per_iter;
8214 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8218 /* Given a complete set of masks MASKS, extract mask number INDEX
8219 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8220 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8222 See the comment above vec_loop_masks for more details about the mask
8223 arrangement. */
8225 tree
8226 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8227 unsigned int nvectors, tree vectype, unsigned int index)
8229 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8230 tree mask_type = rgm->mask_type;
8232 /* Populate the rgroup's mask array, if this is the first time we've
8233 used it. */
8234 if (rgm->masks.is_empty ())
8236 rgm->masks.safe_grow_cleared (nvectors);
8237 for (unsigned int i = 0; i < nvectors; ++i)
8239 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8240 /* Provide a dummy definition until the real one is available. */
8241 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8242 rgm->masks[i] = mask;
8246 tree mask = rgm->masks[index];
8247 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8248 TYPE_VECTOR_SUBPARTS (vectype)))
8250 /* A loop mask for data type X can be reused for data type Y
8251 if X has N times more elements than Y and if Y's elements
8252 are N times bigger than X's. In this case each sequence
8253 of N elements in the loop mask will be all-zero or all-one.
8254 We can then view-convert the mask so that each sequence of
8255 N elements is replaced by a single element. */
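/* Illustrative case: a mask created for 8-element byte data can be reused
   for 4-element halfword data; each pair of mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below can treat the pair as a
   single element of the wider mask type.  */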
8256 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8257 TYPE_VECTOR_SUBPARTS (vectype)));
8258 gimple_seq seq = NULL;
8259 mask_type = build_same_sized_truth_vector_type (vectype);
8260 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8261 if (seq)
8262 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8264 return mask;
8267 /* Scale profiling counters by estimation for LOOP which is vectorized
8268 by factor VF. */
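/* Illustrative numbers: if the scalar loop was estimated to run about 40
   iterations and VF is 4, the vector loop is expected to run about 10
   iterations, so the body counts are scaled down accordingly and the exit
   edge probability becomes roughly 1/(10 + 1).  */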
8270 static void
8271 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8273 edge preheader = loop_preheader_edge (loop);
8274 /* Reduce loop iterations by the vectorization factor. */
8275 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8276 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8278 if (freq_h.nonzero_p ())
8280 profile_probability p;
8282 /* Avoid dropping loop body profile counter to 0 because of zero count
8283 in loop's preheader. */
8284 if (!(freq_e == profile_count::zero ()))
8285 freq_e = freq_e.force_nonzero ();
8286 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8287 scale_loop_frequencies (loop, p);
8290 edge exit_e = single_exit (loop);
8291 exit_e->probability = profile_probability::always ()
8292 .apply_scale (1, new_est_niter + 1);
8294 edge exit_l = single_pred_edge (loop->latch);
8295 profile_probability prob = exit_l->probability;
8296 exit_l->probability = exit_e->probability.invert ();
8297 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8298 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8301 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8302 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8303 *SLP_SCHEDULED is a running record of whether we have called
8304 vect_schedule_slp. */
8306 static void
8307 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8308 gimple_stmt_iterator *gsi,
8309 stmt_vec_info *seen_store, bool *slp_scheduled)
8311 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8312 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8313 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8314 if (!stmt_info)
8315 return;
8317 if (dump_enabled_p ())
8319 dump_printf_loc (MSG_NOTE, vect_location,
8320 "------>vectorizing statement: ");
8321 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8324 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8325 vect_loop_kill_debug_uses (loop, stmt);
8327 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8328 && !STMT_VINFO_LIVE_P (stmt_info))
8329 return;
8331 if (STMT_VINFO_VECTYPE (stmt_info))
8333 poly_uint64 nunits
8334 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8335 if (!STMT_SLP_TYPE (stmt_info)
8336 && maybe_ne (nunits, vf)
8337 && dump_enabled_p ())
8338 /* For SLP, VF is set according to the unrolling factor, and not
8339 to the vector size, hence for SLP this message is not valid. */
8340 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8343 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8344 reached. */
8345 if (STMT_SLP_TYPE (stmt_info))
8347 if (!*slp_scheduled)
8349 *slp_scheduled = true;
8351 DUMP_VECT_SCOPE ("scheduling SLP instances");
8353 vect_schedule_slp (loop_vinfo);
8356 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8357 if (PURE_SLP_STMT (stmt_info))
8358 return;
8361 if (dump_enabled_p ())
8362 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8364 bool grouped_store = false;
8365 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8366 *seen_store = stmt_info;
8369 /* Function vect_transform_loop.
8371 The analysis phase has determined that the loop is vectorizable.
8372 Vectorize the loop - create vectorized stmts to replace the scalar
8373 stmts in the loop, and update the loop exit condition.
8374 Returns scalar epilogue loop if any. */
8376 struct loop *
8377 vect_transform_loop (loop_vec_info loop_vinfo)
8379 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8380 struct loop *epilogue = NULL;
8381 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8382 int nbbs = loop->num_nodes;
8383 int i;
8384 tree niters_vector = NULL_TREE;
8385 tree step_vector = NULL_TREE;
8386 tree niters_vector_mult_vf = NULL_TREE;
8387 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8388 unsigned int lowest_vf = constant_lower_bound (vf);
8389 bool slp_scheduled = false;
8390 gimple *stmt;
8391 bool check_profitability = false;
8392 unsigned int th;
8394 DUMP_VECT_SCOPE ("vec_transform_loop");
8396 loop_vinfo->shared->check_datarefs ();
8398 /* Use the more conservative vectorization threshold. If the number
8399 of iterations is constant, assume the cost check has been performed
8400 by our caller. If the threshold makes all loops profitable that
8401 run at least the (estimated) vectorization factor number of times,
8402 checking is pointless, too. */
8403 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8404 if (th >= vect_vf_for_cost (loop_vinfo)
8405 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8407 if (dump_enabled_p ())
8408 dump_printf_loc (MSG_NOTE, vect_location,
8409 "Profitability threshold is %d loop iterations.\n",
8410 th);
8411 check_profitability = true;
8414 /* Make sure there exists a single-predecessor exit bb. Do this before
8415 versioning. */
8416 edge e = single_exit (loop);
8417 if (! single_pred_p (e->dest))
8419 split_loop_exit_edge (e);
8420 if (dump_enabled_p ())
8421 dump_printf (MSG_NOTE, "split exit edge\n");
8424 /* Version the loop first, if required, so the profitability check
8425 comes first. */
8427 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8429 poly_uint64 versioning_threshold
8430 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8431 if (check_profitability
8432 && ordered_p (poly_uint64 (th), versioning_threshold))
8434 versioning_threshold = ordered_max (poly_uint64 (th),
8435 versioning_threshold);
8436 check_profitability = false;
8438 vect_loop_versioning (loop_vinfo, th, check_profitability,
8439 versioning_threshold);
8440 check_profitability = false;
8443 /* Make sure there exists a single-predecessor exit bb also on the
8444 scalar loop copy. Do this after versioning but before peeling
8445 so the CFG structure is fine for both the scalar and the if-converted
8446 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8447 loop-closed PHI nodes on the exit. */
8448 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8450 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8451 if (! single_pred_p (e->dest))
8453 split_loop_exit_edge (e);
8454 if (dump_enabled_p ())
8455 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8459 tree niters = vect_build_loop_niters (loop_vinfo);
8460 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8461 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8462 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8463 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8464 &step_vector, &niters_vector_mult_vf, th,
8465 check_profitability, niters_no_overflow);
8467 if (niters_vector == NULL_TREE)
8469 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8470 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8471 && known_eq (lowest_vf, vf))
8473 niters_vector
8474 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8475 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8476 step_vector = build_one_cst (TREE_TYPE (niters));
8478 else
8479 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8480 &step_vector, niters_no_overflow);
8483 /* 1) Make sure the loop header has exactly two entries
8484 2) Make sure we have a preheader basic block. */
8486 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8488 split_edge (loop_preheader_edge (loop));
8490 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8491 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8492 /* This will deal with any possible peeling. */
8493 vect_prepare_for_masked_peels (loop_vinfo);
8495 /* FORNOW: the vectorizer supports only loops whose body consists
8496 of one basic block (header + empty latch). When the vectorizer
8497 supports more involved loop forms, the order in which the BBs are
8498 traversed will need to be reconsidered. */
8500 for (i = 0; i < nbbs; i++)
8502 basic_block bb = bbs[i];
8503 stmt_vec_info stmt_info;
8505 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8506 gsi_next (&si))
8508 gphi *phi = si.phi ();
8509 if (dump_enabled_p ())
8511 dump_printf_loc (MSG_NOTE, vect_location,
8512 "------>vectorizing phi: ");
8513 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8515 stmt_info = vinfo_for_stmt (phi);
8516 if (!stmt_info)
8517 continue;
8519 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8520 vect_loop_kill_debug_uses (loop, phi);
8522 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8523 && !STMT_VINFO_LIVE_P (stmt_info))
8524 continue;
8526 if (STMT_VINFO_VECTYPE (stmt_info)
8527 && (maybe_ne
8528 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8529 && dump_enabled_p ())
8530 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8532 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8533 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8534 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8535 && ! PURE_SLP_STMT (stmt_info))
8537 if (dump_enabled_p ())
8538 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8539 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8543 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8544 !gsi_end_p (si);)
8546 stmt = gsi_stmt (si);
8547 /* During vectorization remove existing clobber stmts. */
8548 if (gimple_clobber_p (stmt))
8550 unlink_stmt_vdef (stmt);
8551 gsi_remove (&si, true);
8552 release_defs (stmt);
8554 else
8556 stmt_info = vinfo_for_stmt (stmt);
8558 /* vector stmts created in the outer-loop during vectorization of
8559 stmts in an inner-loop may not have a stmt_info, and do not
8560 need to be vectorized. */
8561 stmt_vec_info seen_store = NULL;
8562 if (stmt_info)
8564 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8566 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8567 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8568 !gsi_end_p (subsi); gsi_next (&subsi))
8569 vect_transform_loop_stmt (loop_vinfo,
8570 gsi_stmt (subsi), &si,
8571 &seen_store,
8572 &slp_scheduled);
8573 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8574 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8575 &seen_store, &slp_scheduled);
8577 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8578 &seen_store, &slp_scheduled);
8580 if (seen_store)
8582 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8584 /* Interleaving. The vectorization of the
8585 interleaving chain was completed - free all
8586 the stores in the chain. */
8587 gsi_next (&si);
8588 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8590 else
8592 /* Free the attached stmt_vec_info and remove the
8593 stmt. */
8594 free_stmt_vec_info (stmt);
8595 unlink_stmt_vdef (stmt);
8596 gsi_remove (&si, true);
8597 release_defs (stmt);
8600 else
8601 gsi_next (&si);
8605 /* Stub out scalar statements that must not survive vectorization.
8606 Doing this here helps with grouped statements, or statements that
8607 are involved in patterns. */
8608 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8609 !gsi_end_p (gsi); gsi_next (&gsi))
8611 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8612 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8614 tree lhs = gimple_get_lhs (call);
8615 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8617 tree zero = build_zero_cst (TREE_TYPE (lhs));
8618 gimple *new_stmt = gimple_build_assign (lhs, zero);
8619 gsi_replace (&gsi, new_stmt, true);
8623 } /* BBs in loop */
8625 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8626 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8627 if (integer_onep (step_vector))
8628 niters_no_overflow = true;
8629 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8630 niters_vector_mult_vf, !niters_no_overflow);
8632 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8633 scale_profile_for_vect_loop (loop, assumed_vf);
8635 /* True if the final iteration might not handle a full vector's
8636 worth of scalar iterations. */
8637 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8638 /* The minimum number of iterations performed by the epilogue. This
8639 is 1 when peeling for gaps because we always need a final scalar
8640 iteration. */
8641 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8642 /* +1 to convert latch counts to loop iteration counts,
8643 -min_epilogue_iters to remove iterations that cannot be performed
8644 by the vector code. */
8645 int bias_for_lowest = 1 - min_epilogue_iters;
8646 int bias_for_assumed = bias_for_lowest;
8647 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8648 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8650 /* When the amount of peeling is known at compile time, the first
8651 iteration will have exactly alignment_npeels active elements.
8652 In the worst case it will have at least one. */
8653 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8654 bias_for_lowest += lowest_vf - min_first_active;
8655 bias_for_assumed += assumed_vf - min_first_active;
8657 /* In these calculations the "- 1" converts loop iteration counts
8658 back to latch counts. */
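/* Worked example (illustrative): with a known upper bound of 100 latch
   iterations, lowest_vf == 4, no fully-masked final iteration and no
   peeling for gaps, bias_for_lowest is 1 and the new bound is
   (100 + 1) / 4 - 1 == 24 latch iterations for the vector loop.  */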
8659 if (loop->any_upper_bound)
8660 loop->nb_iterations_upper_bound
8661 = (final_iter_may_be_partial
8662 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8663 lowest_vf) - 1
8664 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8665 lowest_vf) - 1);
8666 if (loop->any_likely_upper_bound)
8667 loop->nb_iterations_likely_upper_bound
8668 = (final_iter_may_be_partial
8669 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8670 + bias_for_lowest, lowest_vf) - 1
8671 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8672 + bias_for_lowest, lowest_vf) - 1);
8673 if (loop->any_estimate)
8674 loop->nb_iterations_estimate
8675 = (final_iter_may_be_partial
8676 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8677 assumed_vf) - 1
8678 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8679 assumed_vf) - 1);
8681 if (dump_enabled_p ())
8683 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8685 dump_printf_loc (MSG_NOTE, vect_location,
8686 "LOOP VECTORIZED\n");
8687 if (loop->inner)
8688 dump_printf_loc (MSG_NOTE, vect_location,
8689 "OUTER LOOP VECTORIZED\n");
8690 dump_printf (MSG_NOTE, "\n");
8692 else
8694 dump_printf_loc (MSG_NOTE, vect_location,
8695 "LOOP EPILOGUE VECTORIZED (VS=");
8696 dump_dec (MSG_NOTE, current_vector_size);
8697 dump_printf (MSG_NOTE, ")\n");
8701 /* Free SLP instances here because otherwise stmt reference counting
8702 won't work. */
8703 slp_instance instance;
8704 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8705 vect_free_slp_instance (instance);
8706 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8707 /* Clear up the safelen field since its value is invalid after vectorization:
8708 the vectorized loop can have loop-carried dependencies. */
8709 loop->safelen = 0;
8711 /* Don't vectorize an epilogue of an epilogue loop. */
8712 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8713 epilogue = NULL;
8715 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8716 epilogue = NULL;
8718 if (epilogue)
8720 auto_vector_sizes vector_sizes;
8721 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8722 unsigned int next_size = 0;
8724 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8725 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8726 && known_eq (vf, lowest_vf))
8728 unsigned int eiters
8729 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8730 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8731 eiters = eiters % lowest_vf;
8732 epilogue->nb_iterations_upper_bound = eiters - 1;
8734 unsigned int ratio;
8735 while (next_size < vector_sizes.length ()
8736 && !(constant_multiple_p (current_vector_size,
8737 vector_sizes[next_size], &ratio)
8738 && eiters >= lowest_vf / ratio))
8739 next_size += 1;
8741 else
8742 while (next_size < vector_sizes.length ()
8743 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8744 next_size += 1;
8746 if (next_size == vector_sizes.length ())
8747 epilogue = NULL;
8750 if (epilogue)
8752 epilogue->force_vectorize = loop->force_vectorize;
8753 epilogue->safelen = loop->safelen;
8754 epilogue->dont_vectorize = false;
8756 /* We may need to if-convert the epilogue to vectorize it. */
8757 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8758 tree_if_conversion (epilogue);
8761 return epilogue;
8764 /* The code below tries to perform a simple optimization - reverting
8765 if-conversion for masked stores: if the mask of a store is zero, do not
8766 perform the store, and if possible skip the stored-value producers too.
8767 For example,
8768 for (i=0; i<n; i++)
8769 if (c[i])
8771 p1[i] += 1;
8772 p2[i] = p3[i] + 2;
8774 this transformation will produce the following semi-hammock:
8776 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8777 {
8778 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8779 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8780 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8781 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8782 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8783 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8784 }
8785 */
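/* For comparison (illustrative): before this optimization the if-converted
   loop body executes the same sequence unconditionally, e.g.

     vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
     vect__12.22_172 = vect__11.19_170 + vect_cst__171;
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     ...

   so the loads, adds and stores run even when the mask is all-zero.
   optimize_mask_stores below sinks them behind the mask != 0 check shown
   above.  */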
8787 void
8788 optimize_mask_stores (struct loop *loop)
8789 {
8790 basic_block *bbs = get_loop_body (loop);
8791 unsigned nbbs = loop->num_nodes;
8792 unsigned i;
8793 basic_block bb;
8794 struct loop *bb_loop;
8795 gimple_stmt_iterator gsi;
8796 gimple *stmt;
8797 auto_vec<gimple *> worklist;
8799 vect_location = find_loop_location (loop);
8800 /* Collect all masked stores in the loop, if any. */
8801 for (i = 0; i < nbbs; i++)
8802 {
8803 bb = bbs[i];
8804 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8805 gsi_next (&gsi))
8806 {
8807 stmt = gsi_stmt (gsi);
8808 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8809 worklist.safe_push (stmt);
8810 }
8811 }
8813 free (bbs);
8814 if (worklist.is_empty ())
8815 return;
8817 /* Loop has masked stores. */
8818 while (!worklist.is_empty ())
8819 {
8820 gimple *last, *last_store;
8821 edge e, efalse;
8822 tree mask;
8823 basic_block store_bb, join_bb;
8824 gimple_stmt_iterator gsi_to;
8825 tree vdef, new_vdef;
8826 gphi *phi;
8827 tree vectype;
8828 tree zero;
8830 last = worklist.pop ();
8831 mask = gimple_call_arg (last, 2);
8832 bb = gimple_bb (last);
8833 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
8834 the same loop as if_bb. It can differ from LOOP when a two-level
8835 loop nest is vectorized and the mask_store belongs to the inner
8836 loop. */
8837 e = split_block (bb, last);
8838 bb_loop = bb->loop_father;
8839 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8840 join_bb = e->dest;
8841 store_bb = create_empty_bb (bb);
8842 add_bb_to_loop (store_bb, bb_loop);
8843 e->flags = EDGE_TRUE_VALUE;
8844 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8845 /* Make the edge into STORE_BB unlikely; the stores are expected to be skipped most of the time. */
8846 efalse->probability = profile_probability::unlikely ();
8847 store_bb->count = efalse->count ();
8848 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8849 if (dom_info_available_p (CDI_DOMINATORS))
8850 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8851 if (dump_enabled_p ())
8852 dump_printf_loc (MSG_NOTE, vect_location,
8853 "Create new block %d to sink mask stores.",
8854 store_bb->index);
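/* An illustrative sketch of the CFG built above: BB was split right after
   the masked store LAST, so

       BB  ---EDGE_TRUE_VALUE--------------------------->  JOIN_BB
        \                                                  /
         --EDGE_FALSE_VALUE (unlikely)-->  STORE_BB  -----/  (fallthru)

   The mask == 0 comparison that guards these edges is created just below;
   STORE_BB starts out empty, and the masked stores (plus, where possible,
   the statements producing their stored values) are sunk into it by the
   loop that follows.  */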
8855 /* Create vector comparison with boolean result. */
8856 vectype = TREE_TYPE (mask);
8857 zero = build_zero_cst (vectype);
8858 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8859 gsi = gsi_last_bb (bb);
8860 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8861 /* Create new PHI node for vdef of the last masked store:
8862 .MEM_2 = VDEF <.MEM_1>
8863 will be converted to
8864 .MEM_3 = VDEF <.MEM_1>
8865 and new PHI node will be created in join bb
8866 .MEM_2 = PHI <.MEM_1, .MEM_3>
8867 */
8868 vdef = gimple_vdef (last);
8869 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8870 gimple_set_vdef (last, new_vdef);
8871 phi = create_phi_node (vdef, join_bb);
8872 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8874 /* Put all masked stores with the same mask to STORE_BB if possible. */
8875 while (true)
8876 {
8877 gimple_stmt_iterator gsi_from;
8878 gimple *stmt1 = NULL;
8880 /* Move masked store to STORE_BB. */
8881 last_store = last;
8882 gsi = gsi_for_stmt (last);
8883 gsi_from = gsi;
8884 /* Shift GSI to the previous stmt for further traversal. */
8885 gsi_prev (&gsi);
8886 gsi_to = gsi_start_bb (store_bb);
8887 gsi_move_before (&gsi_from, &gsi_to);
8888 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8889 gsi_to = gsi_start_bb (store_bb);
8890 if (dump_enabled_p ())
8891 {
8892 dump_printf_loc (MSG_NOTE, vect_location,
8893 "Move stmt to created bb\n");
8894 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8895 }
8896 /* Move all stored value producers if possible. */
8897 while (!gsi_end_p (gsi))
8898 {
8899 tree lhs;
8900 imm_use_iterator imm_iter;
8901 use_operand_p use_p;
8902 bool res;
8904 /* Skip debug statements. */
8905 if (is_gimple_debug (gsi_stmt (gsi)))
8906 {
8907 gsi_prev (&gsi);
8908 continue;
8909 }
8910 stmt1 = gsi_stmt (gsi);
8911 /* Do not consider statements writing to memory or having
8912 a volatile operand. */
8913 if (gimple_vdef (stmt1)
8914 || gimple_has_volatile_ops (stmt1))
8915 break;
8916 gsi_from = gsi;
8917 gsi_prev (&gsi);
8918 lhs = gimple_get_lhs (stmt1);
8919 if (!lhs)
8920 break;
8922 /* LHS of vectorized stmt must be SSA_NAME. */
8923 if (TREE_CODE (lhs) != SSA_NAME)
8924 break;
8926 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8927 {
8928 /* Remove dead scalar statement. */
8929 if (has_zero_uses (lhs))
8930 {
8931 gsi_remove (&gsi_from, true);
8932 continue;
8933 }
8934 break;
8935 }
8936 /* Check that LHS does not have uses outside of STORE_BB. */
8937 res = true;
8938 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8939 {
8940 gimple *use_stmt;
8941 use_stmt = USE_STMT (use_p);
8942 if (is_gimple_debug (use_stmt))
8943 continue;
8944 if (gimple_bb (use_stmt) != store_bb)
8945 {
8946 res = false;
8947 break;
8948 }
8949 }
8950 if (!res)
8951 break;
8953 if (gimple_vuse (stmt1)
8954 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8955 break;
8957 /* Can move STMT1 to STORE_BB. */
8958 if (dump_enabled_p ())
8959 {
8960 dump_printf_loc (MSG_NOTE, vect_location,
8961 "Move stmt to created bb\n");
8962 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8963 }
8964 gsi_move_before (&gsi_from, &gsi_to);
8965 /* Shift GSI_TO for further insertion. */
8966 gsi_prev (&gsi_to);
8967 }
8968 /* Put other masked stores with the same mask to STORE_BB. */
8969 if (worklist.is_empty ()
8970 || gimple_call_arg (worklist.last (), 2) != mask
8971 || worklist.last () != stmt1)
8972 break;
8973 last = worklist.pop ();
8974 }
8975 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);