gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
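/* As a concrete illustration of the optab check described above, the
   following is a simplified sketch (an assumed shape, not the exact code
   used later in this file) of how target support for a vector addition
   can be queried:

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!op || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;   -- no target support, the stmt cannot be vectorized

   The real per-statement checks live in the vectorizable_* routines.  */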
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case in which a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
222 return false;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: ");
239 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
240 def_stmt_info->stmt, 0);
242 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
243 vf, mask_producers))
244 return false;
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: ");
251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
254 return false;
257 return true;
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264 loop. For example, when vectorizing a loop that operates on 4-byte elements,
265 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
271 in the loop.
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
274 original loop:
275 for (i=0; i<N; i++){
276 a[i] = b[i] + c[i];
277 }
279 vectorized loop:
280 for (i=0; i<N; i+=VF){
281 a[i:VF] = b[i:VF] + c[i:VF];
282 }
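/* Illustrative note (not part of the original comment): with VF = 4 and
   N = 103, the strip-mined loop above executes 25 vector iterations
   covering 100 elements; the remaining 3 elements are handled either by
   a scalar epilogue loop (see the peeling_for_niter machinery below) or,
   on targets that support it, by a fully-masked tail.  */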
285 static bool
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
297 auto_vec<stmt_vec_info> mask_producers;
299 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301 for (i = 0; i < nbbs; i++)
303 basic_block bb = bbs[i];
305 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
306 gsi_next (&si))
308 phi = si.phi ();
309 stmt_info = vinfo_for_stmt (phi);
310 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
313 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
316 gcc_assert (stmt_info);
318 if (STMT_VINFO_RELEVANT_P (stmt_info)
319 || STMT_VINFO_LIVE_P (stmt_info))
321 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
322 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: ");
328 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
329 dump_printf (MSG_NOTE, "\n");
332 vectype = get_vectype_for_scalar_type (scalar_type);
333 if (!vectype)
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
338 "not vectorized: unsupported "
339 "data-type ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 scalar_type);
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 return false;
346 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348 if (dump_enabled_p ())
350 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
351 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
352 dump_printf (MSG_NOTE, "\n");
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
358 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
359 dump_printf (MSG_NOTE, "\n");
362 vect_update_max_nunits (&vectorization_factor, vectype);
366 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
367 gsi_next (&si))
369 stmt_info = vinfo_for_stmt (gsi_stmt (si));
370 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
371 &mask_producers))
372 return false;
376 /* TODO: Analyze cost. Decide if worth while to vectorize. */
377 if (dump_enabled_p ())
379 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
380 dump_dec (MSG_NOTE, vectorization_factor);
381 dump_printf (MSG_NOTE, "\n");
384 if (known_le (vectorization_factor, 1U))
386 if (dump_enabled_p ())
387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
388 "not vectorized: unsupported data-type\n");
389 return false;
391 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393 for (i = 0; i < mask_producers.length (); i++)
395 stmt_info = mask_producers[i];
396 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
397 if (!mask_type)
398 return false;
399 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
402 return true;
406 /* Function vect_is_simple_iv_evolution.
408 FORNOW: A simple evolution of an induction variable in the loop is
409 considered a polynomial evolution. */
411 static bool
412 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
413 tree * step)
415 tree init_expr;
416 tree step_expr;
417 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
418 basic_block bb;
420 /* When there is no evolution in this loop, the evolution function
421 is not "simple". */
422 if (evolution_part == NULL_TREE)
423 return false;
425 /* When the evolution is a polynomial of degree >= 2
426 the evolution function is not "simple". */
427 if (tree_is_chrec (evolution_part))
428 return false;
430 step_expr = evolution_part;
431 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
436 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
437 dump_printf (MSG_NOTE, ", init: ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
439 dump_printf (MSG_NOTE, "\n");
442 *init = init_expr;
443 *step = step_expr;
445 if (TREE_CODE (step_expr) != INTEGER_CST
446 && (TREE_CODE (step_expr) != SSA_NAME
447 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
448 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
449 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
450 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
451 || !flag_associative_math)))
452 && (TREE_CODE (step_expr) != REAL_CST
453 || !flag_associative_math))
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
457 "step unknown.\n");
458 return false;
461 return true;
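/* Example for the routine above (an illustrative sketch, not taken from
   the original comments): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of p is the chrec {p_0, +, 4}_loop, so INIT is p_0
   and STEP is 4, and the evolution is "simple".  A step that itself varies
   inside the loop (a chrec of degree >= 2) is rejected above.  */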
464 /* Function vect_analyze_scalar_cycles_1.
466 Examine the cross iteration def-use cycles of scalar variables
467 in LOOP. LOOP_VINFO represents the loop that is now being
468 considered for vectorization (can be LOOP, or an outer-loop
469 enclosing LOOP). */
471 static void
472 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 basic_block bb = loop->header;
475 tree init, step;
476 auto_vec<gimple *, 64> worklist;
477 gphi_iterator gsi;
478 bool double_reduc;
480 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482 /* First - identify all inductions. Reduction detection assumes that all the
483 inductions have been identified, therefore, this order must not be
484 changed. */
485 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 gphi *phi = gsi.phi ();
488 tree access_fn = NULL;
489 tree def = PHI_RESULT (phi);
490 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
492 if (dump_enabled_p ())
494 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
495 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
498 /* Skip virtual phis. The data dependences that are associated with
499 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
500 if (virtual_operand_p (def))
501 continue;
503 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505 /* Analyze the evolution function. */
506 access_fn = analyze_scalar_evolution (loop, def);
507 if (access_fn)
509 STRIP_NOPS (access_fn);
510 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "Access function of PHI: ");
514 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
515 dump_printf (MSG_NOTE, "\n");
517 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
518 = initial_condition_in_loop_num (access_fn, loop->num);
519 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
520 = evolution_part_in_loop_num (access_fn, loop->num);
523 if (!access_fn
524 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
525 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
526 && TREE_CODE (step) != INTEGER_CST))
528 worklist.safe_push (phi);
529 continue;
532 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
533 != NULL_TREE);
534 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
538 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
542 /* Second - identify all reductions and nested cycles. */
543 while (worklist.length () > 0)
545 gimple *phi = worklist.pop ();
546 tree def = PHI_RESULT (phi);
547 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
548 gimple *reduc_stmt;
550 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
560 &double_reduc, false);
561 if (reduc_stmt)
563 if (double_reduc)
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "Detected double reduction.\n");
569 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
570 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
571 vect_double_reduction_def;
573 else
575 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 if (dump_enabled_p ())
578 dump_printf_loc (MSG_NOTE, vect_location,
579 "Detected vectorizable nested cycle.\n");
581 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
582 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
583 vect_nested_cycle;
585 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
593 vect_reduction_def;
594 /* Store the reduction cycles for possible vectorization in
595 loop-aware SLP if it was not detected as reduction
596 chain. */
597 if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
598 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
610 /* Function vect_analyze_scalar_cycles.
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also for its
616 inner-loop, if it exists.
617 Examples for scalar cycles:
619 Example1: reduction:
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
625 Example2: induction:
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
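/* Illustrative third example (an assumed typical case, not part of the
   original comment): when the outer loop of

   loop3:
   for (i=0; i<N; i++)
     for (j=0; j<M; j++)
       sum += a[i][j];

   is considered for vectorization, the accumulation of 'sum' spans both
   loops; the code below classifies such outer-loop phis as double
   reductions, and cycles that are local to an inner loop as nested
   cycles.  */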
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 /* Transfer group and reduction information from STMT to its pattern stmt. */
653 static void
654 vect_fixup_reduc_chain (gimple *stmt)
656 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
657 gimple *stmtp;
658 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
659 && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
660 REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
661 = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
664 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
665 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
666 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
667 if (stmt)
668 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
669 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
671 while (stmt);
672 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 gimple *first;
681 unsigned i;
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
686 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
687 while (next)
689 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
693 /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
704 /* Function vect_get_loop_niters.
706 Determine the number of iterations the loop executes and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
711 Return the loop exit condition. */
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
728 if (!exit)
729 return cond;
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
745 if (may_be_zero)
747 if (COMPARISON_CLASS_P (may_be_zero))
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
762 may_be_zero = NULL_TREE;
764 else if (integer_nonzerop (may_be_zero))
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
770 else
771 return cond;
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
786 return cond;
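/* Worked example for the routine above (illustrative, assuming a simple
   counted loop): for

     for (i = 0; i < n; i++)
       ...

   with n > 0, the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 and NUMBER_OF_ITERATIONS (the number of header executions) is
   n.  As noted above, the "+ 1" can wrap to zero when the latch executes
   UINT_MAX times.  */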
789 /* Function bb_in_loop_p
791 Used as predicate for dfs order traversal of the loop bbs. */
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
858 free (body);
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
870 /* Free all levels of MASKS. */
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
885 _loop_vec_info::~_loop_vec_info ()
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
891 /* ??? We're releasing loop_vinfos en bloc. */
892 set_stmt_vec_info_vec (&stmt_vec_infos);
893 nbbs = loop->num_nodes;
894 for (j = 0; j < nbbs; j++)
896 basic_block bb = bbs[j];
897 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 free_stmt_vec_info (gsi_stmt (si));
900 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 gimple *stmt = gsi_stmt (si);
904 /* We may have broken canonical form by moving a constant
905 into RHS1 of a commutative op. Fix such occurrences. */
906 if (operands_swapped && is_gimple_assign (stmt))
908 enum tree_code code = gimple_assign_rhs_code (stmt);
910 if ((code == PLUS_EXPR
911 || code == POINTER_PLUS_EXPR
912 || code == MULT_EXPR)
913 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
914 swap_ssa_operands (stmt,
915 gimple_assign_rhs1_ptr (stmt),
916 gimple_assign_rhs2_ptr (stmt));
917 else if (code == COND_EXPR
918 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 tree cond_expr = gimple_assign_rhs1 (stmt);
921 enum tree_code cond_code = TREE_CODE (cond_expr);
923 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
926 0));
927 cond_code = invert_tree_comparison (cond_code,
928 honor_nans);
929 if (cond_code != ERROR_MARK)
931 TREE_SET_CODE (cond_expr, cond_code);
932 swap_ssa_operands (stmt,
933 gimple_assign_rhs2_ptr (stmt),
934 gimple_assign_rhs3_ptr (stmt));
940 /* Free stmt_vec_info. */
941 free_stmt_vec_info (stmt);
942 gsi_next (&si);
946 free (bbs);
948 release_vec_loop_masks (&masks);
949 delete ivexpr_map;
951 loop->aux = NULL;
954 /* Return an invariant or register for EXPR and emit necessary
955 computations in the LOOP_VINFO loop preheader. */
957 tree
958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 if (is_gimple_reg (expr)
961 || is_gimple_min_invariant (expr))
962 return expr;
964 if (! loop_vinfo->ivexpr_map)
965 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
966 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
967 if (! cached)
969 gimple_seq stmts = NULL;
970 cached = force_gimple_operand (unshare_expr (expr),
971 &stmts, true, NULL_TREE);
972 if (stmts)
974 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
975 gsi_insert_seq_on_edge_immediate (e, stmts);
978 return cached;
981 /* Return true if we can use CMP_TYPE as the comparison type to produce
982 all masks required to mask LOOP_VINFO. */
984 static bool
985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 rgroup_masks *rgm;
988 unsigned int i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
990 if (rgm->mask_type != NULL_TREE
991 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
992 cmp_type, rgm->mask_type,
993 OPTIMIZE_FOR_SPEED))
994 return false;
995 return true;
998 /* Calculate the maximum number of scalars per iteration for every
999 rgroup in LOOP_VINFO. */
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 unsigned int res = 1;
1005 unsigned int i;
1006 rgroup_masks *rgm;
1007 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008 res = MAX (res, rgm->max_nscalars_per_iter);
1009 return res;
1012 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1013 whether we can actually generate the masks required. Return true if so,
1014 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 unsigned int min_ni_width;
1022 /* Use a normal loop if there are no statements that need masking.
1023 This only happens in rare degenerate cases: it means that the loop
1024 has no loads, no stores, and no live-out values. */
1025 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026 return false;
1028 /* Get the maximum number of iterations that is representable
1029 in the counter type. */
1030 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033 /* Get a more refined estimate for the number of iterations. */
1034 widest_int max_back_edges;
1035 if (max_loop_iterations (loop, &max_back_edges))
1036 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038 /* Account for rgroup masks, in which each bit is replicated N times. */
1039 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041 /* Work out how many bits we need to represent the limit. */
1042 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044 /* Find a scalar mode for which WHILE_ULT is supported. */
1045 opt_scalar_int_mode cmp_mode_iter;
1046 tree cmp_type = NULL_TREE;
1047 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050 if (cmp_bits >= min_ni_width
1051 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054 if (this_type
1055 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 /* Although we could stop as soon as we find a valid mode,
1058 it's often better to continue until we hit Pmode, since the
1059 operands to the WHILE are more likely to be reusable in
1060 address calculations. */
1061 cmp_type = this_type;
1062 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063 break;
1068 if (!cmp_type)
1069 return false;
1071 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072 return true;
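/* Worked example for the function above (illustrative numbers, not taken
   from the sources): if the loop is known to run at most 1000 iterations
   and the largest rgroup needs 2 scalars per iteration, the limit is 2000,
   which needs 11 bits; the first integer mode of at least that width for
   which the target supports WHILE_ULT (preferring to reach Pmode, for
   address reuse) is then chosen as the mask comparison type.  */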
1075 /* Calculate the cost of one scalar iteration of the loop. */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081 int nbbs = loop->num_nodes, factor;
1082 int innerloop_iters, i;
1084 /* Gather costs for statements in the scalar loop. */
1086 /* FORNOW. */
1087 innerloop_iters = 1;
1088 if (loop->inner)
1089 innerloop_iters = 50; /* FIXME */
1091 for (i = 0; i < nbbs; i++)
1093 gimple_stmt_iterator si;
1094 basic_block bb = bbs[i];
1096 if (bb->loop_father == loop->inner)
1097 factor = innerloop_iters;
1098 else
1099 factor = 1;
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 gimple *stmt = gsi_stmt (si);
1104 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1106 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107 continue;
1109 /* Skip stmts that are not vectorized inside the loop. */
1110 if (stmt_info
1111 && !STMT_VINFO_RELEVANT_P (stmt_info)
1112 && (!STMT_VINFO_LIVE_P (stmt_info)
1113 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115 continue;
1117 vect_cost_for_stmt kind;
1118 if (STMT_VINFO_DATA_REF (stmt_info))
1120 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121 kind = scalar_load;
1122 else
1123 kind = scalar_store;
1125 else
1126 kind = scalar_stmt;
1128 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 factor, kind, stmt_info, 0, vect_prologue);
1133 /* Now accumulate cost. */
1134 void *target_cost_data = init_cost (loop);
1135 stmt_info_for_cost *si;
1136 int j;
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138 j, si)
1140 struct _stmt_vec_info *stmt_info
1141 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1142 (void) add_stmt_cost (target_cost_data, si->count,
1143 si->kind, stmt_info, si->misalign,
1144 vect_body);
1146 unsigned dummy, body_cost = 0;
1147 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148 destroy_cost_data (target_cost_data);
1149 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1153 /* Function vect_analyze_loop_form_1.
1155 Verify that certain CFG restrictions hold, including:
1156 - the loop has a pre-header
1157 - the loop has a single entry and exit
1158 - the loop exit condition is simple enough
1159 - the number of iterations can be analyzed, i.e., a countable loop. The
1160 niter could be analyzed under some assumptions. */
1162 bool
1163 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1164 tree *assumptions, tree *number_of_iterationsm1,
1165 tree *number_of_iterations, gcond **inner_loop_cond)
1167 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1169 /* Different restrictions apply when we are considering an inner-most loop,
1170 vs. an outer (nested) loop.
1171 (FORNOW. May want to relax some of these restrictions in the future). */
1173 if (!loop->inner)
1175 /* Inner-most loop. We currently require that the number of BBs is
1176 exactly 2 (the header and latch). Vectorizable inner-most loops
1177 look like this:
1179 (pre-header)
1181 header <--------+
1182 | | |
1183 | +--> latch --+
1185 (exit-bb) */
1187 if (loop->num_nodes != 2)
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: control flow in loop.\n");
1192 return false;
1195 if (empty_block_p (loop->header))
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: empty loop.\n");
1200 return false;
1203 else
1205 struct loop *innerloop = loop->inner;
1206 edge entryedge;
1208 /* Nested loop. We currently require that the loop is doubly-nested,
1209 contains a single inner loop, and the number of BBs is exactly 5.
1210 Vectorizable outer-loops look like this:
1212 (pre-header)
1214 header <---+
1216 inner-loop |
1218 tail ------+
1220 (exit-bb)
1222 The inner-loop has the properties expected of inner-most loops
1223 as described above. */
1225 if ((loop->inner)->inner || (loop->inner)->next)
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: multiple nested loops.\n");
1230 return false;
1233 if (loop->num_nodes != 5)
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: control flow in loop.\n");
1238 return false;
1241 entryedge = loop_preheader_edge (innerloop);
1242 if (entryedge->src != loop->header
1243 || !single_exit (innerloop)
1244 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: unsupported outerloop form.\n");
1249 return false;
1252 /* Analyze the inner-loop. */
1253 tree inner_niterm1, inner_niter, inner_assumptions;
1254 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1255 &inner_assumptions, &inner_niterm1,
1256 &inner_niter, NULL)
1257 /* Don't support analyzing niter under assumptions for inner
1258 loop. */
1259 || !integer_onep (inner_assumptions))
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "not vectorized: Bad inner loop.\n");
1264 return false;
1267 if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: inner-loop count not"
1272 " invariant.\n");
1273 return false;
1276 if (dump_enabled_p ())
1277 dump_printf_loc (MSG_NOTE, vect_location,
1278 "Considering outer-loop vectorization.\n");
1281 if (!single_exit (loop)
1282 || EDGE_COUNT (loop->header->preds) != 2)
1284 if (dump_enabled_p ())
1286 if (!single_exit (loop))
1287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288 "not vectorized: multiple exits.\n");
1289 else if (EDGE_COUNT (loop->header->preds) != 2)
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: too many incoming edges.\n");
1293 return false;
1296 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1297 that the loop is represented as a do-while (with a proper if-guard
1298 before the loop if needed), where the loop header contains all the
1299 executable statements, and the latch is empty. */
1300 if (!empty_block_p (loop->latch)
1301 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "not vectorized: latch block not empty.\n");
1306 return false;
1309 /* Make sure the exit is not abnormal. */
1310 edge e = single_exit (loop);
1311 if (e->flags & EDGE_ABNORMAL)
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "not vectorized: abnormal loop exit edge.\n");
1316 return false;
1319 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1320 number_of_iterationsm1);
1321 if (!*loop_cond)
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: complicated exit condition.\n");
1326 return false;
1329 if (integer_zerop (*assumptions)
1330 || !*number_of_iterations
1331 || chrec_contains_undetermined (*number_of_iterations))
1333 if (dump_enabled_p ())
1334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1335 "not vectorized: number of iterations cannot be "
1336 "computed.\n");
1337 return false;
1340 if (integer_zerop (*number_of_iterations))
1342 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 "not vectorized: number of iterations = 0.\n");
1345 return false;
1348 return true;
1351 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1353 loop_vec_info
1354 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1356 tree assumptions, number_of_iterations, number_of_iterationsm1;
1357 gcond *loop_cond, *inner_loop_cond = NULL;
1359 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1360 &assumptions, &number_of_iterationsm1,
1361 &number_of_iterations, &inner_loop_cond))
1362 return NULL;
1364 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1365 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1366 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1367 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1368 if (!integer_onep (assumptions))
1370 /* We consider vectorizing this loop by versioning it under
1371 some assumptions. In order to do this, we need to clear
1372 existing information computed by scev and niter analyzer. */
1373 scev_reset_htab ();
1374 free_numbers_of_iterations_estimates (loop);
1375 /* Also set flag for this loop so that following scev and niter
1376 analysis are done under the assumptions. */
1377 loop_constraint_set (loop, LOOP_C_FINITE);
1378 /* Also record the assumptions for versioning. */
1379 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1382 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1384 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_NOTE, vect_location,
1387 "Symbolic number of iterations is ");
1388 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1389 dump_printf (MSG_NOTE, "\n");
1393 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1394 if (inner_loop_cond)
1395 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1396 = loop_exit_ctrl_vec_info_type;
1398 gcc_assert (!loop->aux);
1399 loop->aux = loop_vinfo;
1400 return loop_vinfo;
1405 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1406 statements, update the vectorization factor. */
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1411 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413 int nbbs = loop->num_nodes;
1414 poly_uint64 vectorization_factor;
1415 int i;
1417 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1419 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420 gcc_assert (known_ne (vectorization_factor, 0U));
1422 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423 the vectorization factor of the loop is the unrolling factor required by
1424 the SLP instances. If that unrolling factor is 1, we say that we
1425 perform pure SLP on the loop - cross-iteration parallelism is not
1426 exploited. */
1427 bool only_slp_in_loop = true;
1428 for (i = 0; i < nbbs; i++)
1430 basic_block bb = bbs[i];
1431 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432 gsi_next (&si))
1434 gimple *stmt = gsi_stmt (si);
1435 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1436 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1437 && STMT_VINFO_RELATED_STMT (stmt_info))
1439 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1440 stmt_info = vinfo_for_stmt (stmt);
1442 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1443 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1444 && !PURE_SLP_STMT (stmt_info))
1445 /* STMT needs both SLP and loop-based vectorization. */
1446 only_slp_in_loop = false;
1450 if (only_slp_in_loop)
1452 dump_printf_loc (MSG_NOTE, vect_location,
1453 "Loop contains only SLP stmts\n");
1454 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1456 else
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "Loop contains SLP and non-SLP stmts\n");
1460 /* Both the vectorization factor and unroll factor have the form
1461 current_vector_size * X for some rational X, so they must have
1462 a common multiple. */
1463 vectorization_factor
1464 = force_common_multiple (vectorization_factor,
1465 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1468 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1469 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Updating vectorization factor to ");
1473 dump_dec (MSG_NOTE, vectorization_factor);
1474 dump_printf (MSG_NOTE, ".\n");
1478 /* Return true if STMT_INFO describes a double reduction phi and if
1479 the other phi in the reduction is also relevant for vectorization.
1480 This rejects cases such as:
1482 outer1:
1483 x_1 = PHI <x_3(outer2), ...>;
1486 inner:
1487 x_2 = ...;
1490 outer2:
1491 x_3 = PHI <x_2(inner)>;
1493 if nothing in x_2 or elsewhere makes x_1 relevant. */
1495 static bool
1496 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1498 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1499 return false;
1501 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1502 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1505 /* Function vect_analyze_loop_operations.
1507 Scan the loop stmts and make sure they are all vectorizable. */
1509 static bool
1510 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1512 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1513 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1514 int nbbs = loop->num_nodes;
1515 int i;
1516 stmt_vec_info stmt_info;
1517 bool need_to_vectorize = false;
1518 bool ok;
1520 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1522 stmt_vector_for_cost cost_vec;
1523 cost_vec.create (2);
1525 for (i = 0; i < nbbs; i++)
1527 basic_block bb = bbs[i];
1529 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1530 gsi_next (&si))
1532 gphi *phi = si.phi ();
1533 ok = true;
1535 stmt_info = vinfo_for_stmt (phi);
1536 if (dump_enabled_p ())
1538 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1539 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1541 if (virtual_operand_p (gimple_phi_result (phi)))
1542 continue;
1544 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1545 (i.e., a phi in the tail of the outer-loop). */
1546 if (! is_loop_header_bb_p (bb))
1548 /* FORNOW: we currently don't support the case that these phis
1549 are not used in the outer loop (unless it is a double reduction,
1550 i.e., this phi is vect_reduction_def), because this case
1551 requires us to actually do something here. */
1552 if (STMT_VINFO_LIVE_P (stmt_info)
1553 && !vect_active_double_reduction_p (stmt_info))
1555 if (dump_enabled_p ())
1556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1557 "Unsupported loop-closed phi in "
1558 "outer-loop.\n");
1559 return false;
1562 /* If PHI is used in the outer loop, we check that its operand
1563 is defined in the inner loop. */
1564 if (STMT_VINFO_RELEVANT_P (stmt_info))
1566 tree phi_op;
1567 gimple *op_def_stmt;
1569 if (gimple_phi_num_args (phi) != 1)
1570 return false;
1572 phi_op = PHI_ARG_DEF (phi, 0);
1573 if (TREE_CODE (phi_op) != SSA_NAME)
1574 return false;
1576 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1577 if (gimple_nop_p (op_def_stmt)
1578 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1579 || !vinfo_for_stmt (op_def_stmt))
1580 return false;
1582 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1583 != vect_used_in_outer
1584 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1585 != vect_used_in_outer_by_reduction)
1586 return false;
1589 continue;
1592 gcc_assert (stmt_info);
1594 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1595 || STMT_VINFO_LIVE_P (stmt_info))
1596 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1598 /* A scalar-dependence cycle that we don't support. */
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: scalar dependence cycle.\n");
1602 return false;
1605 if (STMT_VINFO_RELEVANT_P (stmt_info))
1607 need_to_vectorize = true;
1608 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1609 && ! PURE_SLP_STMT (stmt_info))
1610 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1611 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1612 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1613 && ! PURE_SLP_STMT (stmt_info))
1614 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1615 &cost_vec);
1618 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1619 if (ok
1620 && STMT_VINFO_LIVE_P (stmt_info)
1621 && !PURE_SLP_STMT (stmt_info))
1622 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1623 &cost_vec);
1625 if (!ok)
1627 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "not vectorized: relevant phi not "
1631 "supported: ");
1632 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1634 return false;
1638 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1639 gsi_next (&si))
1641 gimple *stmt = gsi_stmt (si);
1642 if (!gimple_clobber_p (stmt)
1643 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1644 &cost_vec))
1645 return false;
1647 } /* bbs */
1649 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1650 cost_vec.release ();
1652 /* All operations in the loop are either irrelevant (deal with loop
1653 control, or dead), or only used outside the loop and can be moved
1654 out of the loop (e.g. invariants, inductions). The loop can be
1655 optimized away by scalar optimizations. We're better off not
1656 touching this loop. */
1657 if (!need_to_vectorize)
1659 if (dump_enabled_p ())
1660 dump_printf_loc (MSG_NOTE, vect_location,
1661 "All the computation can be taken out of the loop.\n");
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664 "not vectorized: redundant loop. no profit to "
1665 "vectorize.\n");
1666 return false;
1669 return true;
1672 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1673 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1674 definitely no, or -1 if it's worth retrying. */
1676 static int
1677 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1679 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1680 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1682 /* Only fully-masked loops can have iteration counts less than the
1683 vectorization factor. */
1684 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1686 HOST_WIDE_INT max_niter;
1688 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1689 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1690 else
1691 max_niter = max_stmt_executions_int (loop);
1693 if (max_niter != -1
1694 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: iteration count smaller than "
1699 "vectorization factor.\n");
1700 return 0;
1704 int min_profitable_iters, min_profitable_estimate;
1705 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1706 &min_profitable_estimate);
1708 if (min_profitable_iters < 0)
1710 if (dump_enabled_p ())
1711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1712 "not vectorized: vectorization not profitable.\n");
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715 "not vectorized: vector version will never be "
1716 "profitable.\n");
1717 return -1;
1720 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1721 * assumed_vf);
1723 /* Use the cost model only if it is more conservative than the user-specified
1724 threshold. */
1725 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1726 min_profitable_iters);
1728 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1730 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1731 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "not vectorized: vectorization not profitable.\n");
1736 if (dump_enabled_p ())
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "not vectorized: iteration count smaller than user "
1739 "specified loop bound parameter or minimum profitable "
1740 "iterations (whichever is more conservative).\n");
1741 return 0;
1744 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1745 if (estimated_niter == -1)
1746 estimated_niter = likely_max_stmt_executions_int (loop);
1747 if (estimated_niter != -1
1748 && ((unsigned HOST_WIDE_INT) estimated_niter
1749 < MAX (th, (unsigned) min_profitable_estimate)))
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "not vectorized: estimated iteration count too "
1754 "small.\n");
1755 if (dump_enabled_p ())
1756 dump_printf_loc (MSG_NOTE, vect_location,
1757 "not vectorized: estimated iteration count smaller "
1758 "than specified loop bound parameter or minimum "
1759 "profitable iterations (whichever is more "
1760 "conservative).\n");
1761 return -1;
1764 return 1;
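/* Illustrative note on the threshold computed above (assumed example
   numbers): with --param min-vect-loop-bound=0, assumed_vf = 4 and
   min_profitable_iters = 12, the runtime threshold th is 12, so a loop
   whose iteration count is known to be 8 is rejected here, while a loop
   with an unknown count proceeds and is guarded by th at runtime.  */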
1767 static bool
1768 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1769 vec<data_reference_p> *datarefs,
1770 unsigned int *n_stmts)
1772 *n_stmts = 0;
1773 for (unsigned i = 0; i < loop->num_nodes; i++)
1774 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1775 !gsi_end_p (gsi); gsi_next (&gsi))
1777 gimple *stmt = gsi_stmt (gsi);
1778 if (is_gimple_debug (stmt))
1779 continue;
1780 ++(*n_stmts);
1781 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1783 if (is_gimple_call (stmt) && loop->safelen)
1785 tree fndecl = gimple_call_fndecl (stmt), op;
1786 if (fndecl != NULL_TREE)
1788 cgraph_node *node = cgraph_node::get (fndecl);
1789 if (node != NULL && node->simd_clones != NULL)
1791 unsigned int j, n = gimple_call_num_args (stmt);
1792 for (j = 0; j < n; j++)
1794 op = gimple_call_arg (stmt, j);
1795 if (DECL_P (op)
1796 || (REFERENCE_CLASS_P (op)
1797 && get_base_address (op)))
1798 break;
1800 op = gimple_call_lhs (stmt);
1801 /* Ignore #pragma omp declare simd functions
1802 if they don't have data references in the
1803 call stmt itself. */
1804 if (j == n
1805 && !(op
1806 && (DECL_P (op)
1807 || (REFERENCE_CLASS_P (op)
1808 && get_base_address (op)))))
1809 continue;
1813 return false;
1815 /* If dependence analysis will give up due to the limit on the
1816 number of datarefs, stop here and fail fatally. */
1817 if (datarefs->length ()
1818 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1819 return false;
1821 return true;
1824 /* Function vect_analyze_loop_2.
1826 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1827 for it. The different analyses will record information in the
1828 loop_vec_info struct. */
1829 static bool
1830 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1832 bool ok;
1833 int res;
1834 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1835 poly_uint64 min_vf = 2;
1837 /* The first group of checks is independent of the vector size. */
1838 fatal = true;
1840 /* Find all data references in the loop (which correspond to vdefs/vuses)
1841 and analyze their evolution in the loop. */
1843 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 /* Gather the data references and count stmts in the loop. */
1846 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1848 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1849 &LOOP_VINFO_DATAREFS (loop_vinfo),
1850 n_stmts))
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "not vectorized: loop contains function "
1855 "calls or data references that cannot "
1856 "be analyzed\n");
1857 return false;
1859 loop_vinfo->shared->save_datarefs ();
1861 else
1862 loop_vinfo->shared->check_datarefs ();
1864 /* Analyze the data references and also adjust the minimal
1865 vectorization factor according to the loads and stores. */
1867 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1868 if (!ok)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "bad data references.\n");
1873 return false;
1876 /* Classify all cross-iteration scalar data-flow cycles.
1877 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1878 vect_analyze_scalar_cycles (loop_vinfo);
1880 vect_pattern_recog (loop_vinfo);
1882 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1884 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1885 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1887 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1888 if (!ok)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data access.\n");
1893 return false;
1896 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1898 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1899 if (!ok)
1901 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903 "unexpected pattern.\n");
1904 return false;
1907 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
1908 fatal = false;
1910 /* Analyze data dependences between the data-refs in the loop
1911 and adjust the maximum vectorization factor according to
1912 the dependences.
1913 FORNOW: fail at the first data dependence that we encounter. */
1915 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1916 if (!ok
1917 || (max_vf != MAX_VECTORIZATION_FACTOR
1918 && maybe_lt (max_vf, min_vf)))
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "bad data dependence.\n");
1923 return false;
1925 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1927 ok = vect_determine_vectorization_factor (loop_vinfo);
1928 if (!ok)
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 "can't determine vectorization factor.\n");
1933 return false;
1935 if (max_vf != MAX_VECTORIZATION_FACTOR
1936 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1938 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940 "bad data dependence.\n");
1941 return false;
1944 /* Compute the scalar iteration cost. */
1945 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1947 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 unsigned th;
1950 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1951 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1952 if (!ok)
1953 return false;
1955 /* If there are any SLP instances mark them as pure_slp. */
1956 bool slp = vect_make_slp_decision (loop_vinfo);
1957 if (slp)
1959 /* Find stmts that need to be both vectorized and SLPed. */
1960 vect_detect_hybrid_slp (loop_vinfo);
1962 /* Update the vectorization factor based on the SLP decision. */
1963 vect_update_vf_for_slp (loop_vinfo);
1966 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1968 /* We don't expect to have to roll back to anything other than an empty
1969 set of rgroups. */
1970 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1972 /* This is the point where we can re-start analysis with SLP forced off. */
1973 start_over:
1975 /* Now the vectorization factor is final. */
1976 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977 gcc_assert (known_ne (vectorization_factor, 0U));
1979 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "vectorization_factor = ");
1983 dump_dec (MSG_NOTE, vectorization_factor);
1984 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1985 LOOP_VINFO_INT_NITERS (loop_vinfo));
1988 HOST_WIDE_INT max_niter
1989 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1991 /* Analyze the alignment of the data-refs in the loop.
1992 Fail if a data reference is found that cannot be vectorized. */
1994 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1995 if (!ok)
1997 if (dump_enabled_p ())
1998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1999 "bad data alignment.\n");
2000 return false;
2003 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2004 It is important to call pruning after vect_analyze_data_ref_accesses,
2005 since we use grouping information gathered by interleaving analysis. */
2006 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2007 if (!ok)
2008 return false;
2010 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2011 vectorization. */
2012 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2014 /* This pass will decide on using loop versioning and/or loop peeling in
2015 order to enhance the alignment of data references in the loop. */
2016 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2017 if (!ok)
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021 "bad data alignment.\n");
2022 return false;
2026 if (slp)
2028 /* Analyze operations in the SLP instances. Note this may
2029 remove unsupported SLP instances which makes the above
2030 SLP kind detection invalid. */
2031 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2032 vect_slp_analyze_operations (loop_vinfo);
2033 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2034 goto again;
2037 /* Scan all the remaining operations in the loop that are not subject
2038 to SLP and make sure they are vectorizable. */
2039 ok = vect_analyze_loop_operations (loop_vinfo);
2040 if (!ok)
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2044 "bad operation or unsupported loop bound.\n");
2045 return false;
2048 /* Decide whether to use a fully-masked loop for this vectorization
2049 factor. */
2050 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2051 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2052 && vect_verify_full_masking (loop_vinfo));
2053 if (dump_enabled_p ())
2055 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "using a fully-masked loop.\n");
2058 else
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not using a fully-masked loop.\n");
2063 /* If an epilogue loop is required because of data accesses with gaps,
2064 one additional iteration needs to be peeled. Check if there are
2065 enough iterations for vectorization. */
2066 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2067 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2068 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2070 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2071 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2073 if (known_lt (wi::to_widest (scalar_niters), vf))
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location,
2077 "loop has no enough iterations to support"
2078 " peeling for gaps.\n");
2079 return false;
2083 /* Check that the costings of the loop make vectorizing worthwhile. */
2084 res = vect_analyze_loop_costing (loop_vinfo);
2085 if (res < 0)
2086 goto again;
2087 if (!res)
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2091 "Loop costings not worthwhile.\n");
2092 return false;
2095 /* Decide whether we need to create an epilogue loop to handle
2096 remaining scalar iterations. */
2097 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2099 unsigned HOST_WIDE_INT const_vf;
2100 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2101 /* The main loop handles all iterations. */
2102 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2103 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2104 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2106 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2107 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2108 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2109 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2111 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2112 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2113 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2114 < (unsigned) exact_log2 (const_vf))
2115 /* In case of versioning, check if the maximum number of
2116 iterations is greater than th. If they are identical,
2117 the epilogue is unnecessary. */
2118 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2119 || ((unsigned HOST_WIDE_INT) max_niter
2120 > (th / const_vf) * const_vf))))
2121 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
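/* Illustrative example with assumed numbers: for a constant vectorization
   factor const_vf == 4 and NITERS known to be 10, tree_ctz (10) == 1 is
   smaller than exact_log2 (4) == 2, so the iteration count is not a
   multiple of the VF and PEELING_FOR_NITER is set; with NITERS == 16
   (tree_ctz == 4) this path would not request an epilogue.  */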
2123 /* If an epilogue loop is required make sure we can create one. */
2124 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2125 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2129 if (!vect_can_advance_ivs_p (loop_vinfo)
2130 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2131 single_exit (LOOP_VINFO_LOOP
2132 (loop_vinfo))))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "not vectorized: can't create required "
2137 "epilog loop\n");
2138 goto again;
2142 /* During peeling, we need to check whether the number of loop iterations
2143 is enough for both the peeled prolog loop and the vector loop. This
2144 check can be merged with the threshold check of loop versioning, so
2145 increase the threshold for this case if necessary. */
2146 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2148 poly_uint64 niters_th = 0;
2150 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2152 /* Niters for peeled prolog loop. */
2153 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2155 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2156 tree vectype
2157 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2158 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2160 else
2161 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2164 /* Niters for at least one iteration of vectorized loop. */
2165 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2166 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2167 /* One additional iteration because of peeling for gap. */
2168 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2169 niters_th += 1;
2170 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
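/* Illustrative example with assumed numbers: for a versioned, not fully
   masked loop that peels 3 prologue iterations for alignment, has VF == 8
   and needs peeling for gaps, the accumulation above yields
   niters_th = 3 + 8 + 1 = 12, so the runtime versioning check roughly
   requires at least 12 iterations before the vector variant is entered.  */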
2173 gcc_assert (known_eq (vectorization_factor,
2174 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2176 /* Ok to vectorize! */
2177 return true;
2179 again:
2180 /* Try again with SLP forced off; if we didn't do any SLP there is
2181 no point in re-trying. */
2182 if (!slp)
2183 return false;
2185 /* If there are reduction chains re-trying will fail anyway. */
2186 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2187 return false;
2189 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2190 via interleaving or lane instructions. */
2191 slp_instance instance;
2192 slp_tree node;
2193 unsigned i, j;
2194 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2196 stmt_vec_info vinfo;
2197 vinfo = vinfo_for_stmt
2198 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2199 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2200 continue;
2201 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2202 unsigned int size = DR_GROUP_SIZE (vinfo);
2203 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2204 if (! vect_store_lanes_supported (vectype, size, false)
2205 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2206 && ! vect_grouped_store_supported (vectype, size))
2207 return false;
2208 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2210 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2211 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2212 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2213 size = DR_GROUP_SIZE (vinfo);
2214 vectype = STMT_VINFO_VECTYPE (vinfo);
2215 if (! vect_load_lanes_supported (vectype, size, false)
2216 && ! vect_grouped_load_supported (vectype, single_element_p,
2217 size))
2218 return false;
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "re-trying with SLP disabled\n");
2226 /* Roll back state appropriately. No SLP this time. */
2227 slp = false;
2228 /* Restore the vectorization factor to what it would be without SLP. */
2229 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2230 /* Free the SLP instances. */
2231 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2232 vect_free_slp_instance (instance);
2233 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2234 /* Reset SLP type to loop_vect on all stmts. */
2235 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2237 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2238 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2239 !gsi_end_p (si); gsi_next (&si))
2241 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2242 STMT_SLP_TYPE (stmt_info) = loop_vect;
2244 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2245 !gsi_end_p (si); gsi_next (&si))
2247 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2248 STMT_SLP_TYPE (stmt_info) = loop_vect;
2249 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2251 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2252 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2253 STMT_SLP_TYPE (stmt_info) = loop_vect;
2254 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2255 !gsi_end_p (pi); gsi_next (&pi))
2257 gimple *pstmt = gsi_stmt (pi);
2258 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2263 /* Free optimized alias test DDRS. */
2264 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2265 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2266 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2267 /* Reset target cost data. */
2268 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2269 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2270 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2271 /* Reset accumulated rgroup information. */
2272 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2273 /* Reset assorted flags. */
2274 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2275 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2276 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2277 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2278 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2280 goto start_over;
2283 /* Function vect_analyze_loop.
2285 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2286 for it. The different analyses will record information in the
2287 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2288 be vectorized. */
2289 loop_vec_info
2290 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2291 vec_info_shared *shared)
2293 loop_vec_info loop_vinfo;
2294 auto_vector_sizes vector_sizes;
2296 /* Autodetect first vector size we try. */
2297 current_vector_size = 0;
2298 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2299 unsigned int next_size = 0;
2301 DUMP_VECT_SCOPE ("analyze_loop_nest");
2303 if (loop_outer (loop)
2304 && loop_vec_info_for_loop (loop_outer (loop))
2305 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_NOTE, vect_location,
2309 "outer-loop already vectorized.\n");
2310 return NULL;
2313 if (!find_loop_nest (loop, &shared->loop_nest))
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "not vectorized: loop nest containing two "
2318 "or more consecutive inner loops cannot be "
2319 "vectorized\n");
2320 return NULL;
2323 unsigned n_stmts = 0;
2324 poly_uint64 autodetected_vector_size = 0;
2325 while (1)
2327 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2328 loop_vinfo = vect_analyze_loop_form (loop, shared);
2329 if (!loop_vinfo)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "bad loop form.\n");
2334 return NULL;
2337 bool fatal = false;
2339 if (orig_loop_vinfo)
2340 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2342 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2344 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2346 return loop_vinfo;
2349 delete loop_vinfo;
2351 if (next_size == 0)
2352 autodetected_vector_size = current_vector_size;
2354 if (next_size < vector_sizes.length ()
2355 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2356 next_size += 1;
2358 if (fatal
2359 || next_size == vector_sizes.length ()
2360 || known_eq (current_vector_size, 0U))
2361 return NULL;
2363 /* Try the next biggest vector size. */
2364 current_vector_size = vector_sizes[next_size++];
2365 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "***** Re-trying analysis with "
2369 "vector size ");
2370 dump_dec (MSG_NOTE, current_vector_size);
2371 dump_printf (MSG_NOTE, "\n");
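/* Illustrative sketch with assumed target behaviour: if
   targetm.vectorize.autovectorize_vector_sizes returned, say, { 32, 16, 8 }
   and the autodetected current_vector_size was 32, a failed but non-fatal
   analysis is re-tried with 16 and then with 8 before giving up; the entry
   equal to the autodetected size is skipped so it is not analyzed twice.  */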
2376 /* Return true if there is an in-order reduction function for CODE, storing
2377 it in *REDUC_FN if so. */
2379 static bool
2380 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2382 switch (code)
2384 case PLUS_EXPR:
2385 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2386 return true;
2388 default:
2389 return false;
2393 /* Function reduction_fn_for_scalar_code
2395 Input:
2396 CODE - tree_code of a reduction operation.
2398 Output:
2399 REDUC_FN - the corresponding internal function to be used to reduce the
2400 vector of partial results into a single scalar result, or IFN_LAST
2401 if the operation is a supported reduction operation, but does not have
2402 such an internal function.
2404 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2406 static bool
2407 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2409 switch (code)
2411 case MAX_EXPR:
2412 *reduc_fn = IFN_REDUC_MAX;
2413 return true;
2415 case MIN_EXPR:
2416 *reduc_fn = IFN_REDUC_MIN;
2417 return true;
2419 case PLUS_EXPR:
2420 *reduc_fn = IFN_REDUC_PLUS;
2421 return true;
2423 case BIT_AND_EXPR:
2424 *reduc_fn = IFN_REDUC_AND;
2425 return true;
2427 case BIT_IOR_EXPR:
2428 *reduc_fn = IFN_REDUC_IOR;
2429 return true;
2431 case BIT_XOR_EXPR:
2432 *reduc_fn = IFN_REDUC_XOR;
2433 return true;
2435 case MULT_EXPR:
2436 case MINUS_EXPR:
2437 *reduc_fn = IFN_LAST;
2438 return true;
2440 default:
2441 return false;
2445 /* If there is a neutral value X such that SLP reduction NODE would not
2446 be affected by the introduction of additional X elements, return that X,
2447 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2448 is true if the SLP statements perform a single reduction, false if each
2449 statement performs an independent reduction. */
2451 static tree
2452 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2453 bool reduc_chain)
2455 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2456 gimple *stmt = stmts[0];
2457 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2458 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2459 tree scalar_type = TREE_TYPE (vector_type);
2460 struct loop *loop = gimple_bb (stmt)->loop_father;
2461 gcc_assert (loop);
2463 switch (code)
2465 case WIDEN_SUM_EXPR:
2466 case DOT_PROD_EXPR:
2467 case SAD_EXPR:
2468 case PLUS_EXPR:
2469 case MINUS_EXPR:
2470 case BIT_IOR_EXPR:
2471 case BIT_XOR_EXPR:
2472 return build_zero_cst (scalar_type);
2474 case MULT_EXPR:
2475 return build_one_cst (scalar_type);
2477 case BIT_AND_EXPR:
2478 return build_all_ones_cst (scalar_type);
2480 case MAX_EXPR:
2481 case MIN_EXPR:
2482 /* For MIN/MAX the initial values are neutral. A reduction chain
2483 has only a single initial value, so that value is neutral for
2484 all statements. */
2485 if (reduc_chain)
2486 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2487 return NULL_TREE;
2489 default:
2490 return NULL_TREE;
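/* Illustrative example: for an SLP reduction such as
     sum0 += a[2*i];  sum1 += a[2*i+1];
   padding the participating lanes with the PLUS_EXPR neutral element 0
   leaves both partial sums unchanged; a MULT_EXPR reduction would likewise
   be padded with 1 and a BIT_AND_EXPR reduction with an all-ones constant,
   matching the cases handled above.  */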
2494 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2495 STMT is printed with a message MSG. */
2497 static void
2498 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2500 dump_printf_loc (msg_type, vect_location, "%s", msg);
2501 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2505 /* Detect SLP reduction of the form:
2507 #a1 = phi <a5, a0>
2508 a2 = operation (a1)
2509 a3 = operation (a2)
2510 a4 = operation (a3)
2511 a5 = operation (a4)
2513 #a = phi <a5>
2515 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2516 FIRST_STMT is the first reduction stmt in the chain
2517 (a2 = operation (a1)).
2519 Return TRUE if a reduction chain was detected. */
2521 static bool
2522 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2523 gimple *first_stmt)
2525 struct loop *loop = (gimple_bb (phi))->loop_father;
2526 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2527 enum tree_code code;
2528 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2529 stmt_vec_info use_stmt_info, current_stmt_info;
2530 tree lhs;
2531 imm_use_iterator imm_iter;
2532 use_operand_p use_p;
2533 int nloop_uses, size = 0, n_out_of_loop_uses;
2534 bool found = false;
2536 if (loop != vect_loop)
2537 return false;
2539 lhs = PHI_RESULT (phi);
2540 code = gimple_assign_rhs_code (first_stmt);
2541 while (1)
2543 nloop_uses = 0;
2544 n_out_of_loop_uses = 0;
2545 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2547 gimple *use_stmt = USE_STMT (use_p);
2548 if (is_gimple_debug (use_stmt))
2549 continue;
2551 /* Check if we got back to the reduction phi. */
2552 if (use_stmt == phi)
2554 loop_use_stmt = use_stmt;
2555 found = true;
2556 break;
2559 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2561 loop_use_stmt = use_stmt;
2562 nloop_uses++;
2564 else
2565 n_out_of_loop_uses++;
2567 /* There can be either a single use in the loop or two uses in
2568 phi nodes. */
2569 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2570 return false;
2573 if (found)
2574 break;
2576 /* We reached a statement with no loop uses. */
2577 if (nloop_uses == 0)
2578 return false;
2580 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2581 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2582 return false;
2584 if (!is_gimple_assign (loop_use_stmt)
2585 || code != gimple_assign_rhs_code (loop_use_stmt)
2586 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2587 return false;
2589 /* Insert USE_STMT into reduction chain. */
2590 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2591 if (current_stmt)
2593 current_stmt_info = vinfo_for_stmt (current_stmt);
2594 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2595 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2596 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2598 else
2599 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2601 lhs = gimple_assign_lhs (loop_use_stmt);
2602 current_stmt = loop_use_stmt;
2603 size++;
2606 if (!found || loop_use_stmt != phi || size < 2)
2607 return false;
2609 /* Swap the operands, if needed, to make the reduction operand be the second
2610 operand. */
2611 lhs = PHI_RESULT (phi);
2612 next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2613 while (next_stmt)
2615 if (gimple_assign_rhs2 (next_stmt) == lhs)
2617 tree op = gimple_assign_rhs1 (next_stmt);
2618 gimple *def_stmt = NULL;
2620 if (TREE_CODE (op) == SSA_NAME)
2621 def_stmt = SSA_NAME_DEF_STMT (op);
2623 /* Check that the other def is either defined in the loop
2624 ("vect_internal_def"), or it's an induction (defined by a
2625 loop-header phi-node). */
2626 if (def_stmt
2627 && gimple_bb (def_stmt)
2628 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2629 && (is_gimple_assign (def_stmt)
2630 || is_gimple_call (def_stmt)
2631 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2632 == vect_induction_def
2633 || (gimple_code (def_stmt) == GIMPLE_PHI
2634 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2635 == vect_internal_def
2636 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2638 lhs = gimple_assign_lhs (next_stmt);
2639 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2640 continue;
2643 return false;
2645 else
2647 tree op = gimple_assign_rhs2 (next_stmt);
2648 gimple *def_stmt = NULL;
2650 if (TREE_CODE (op) == SSA_NAME)
2651 def_stmt = SSA_NAME_DEF_STMT (op);
2653 /* Check that the other def is either defined in the loop
2654 ("vect_internal_def"), or it's an induction (defined by a
2655 loop-header phi-node). */
2656 if (def_stmt
2657 && gimple_bb (def_stmt)
2658 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2659 && (is_gimple_assign (def_stmt)
2660 || is_gimple_call (def_stmt)
2661 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2662 == vect_induction_def
2663 || (gimple_code (def_stmt) == GIMPLE_PHI
2664 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2665 == vect_internal_def
2666 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2668 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2671 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2674 swap_ssa_operands (next_stmt,
2675 gimple_assign_rhs1_ptr (next_stmt),
2676 gimple_assign_rhs2_ptr (next_stmt));
2677 update_stmt (next_stmt);
2679 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2680 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2682 else
2683 return false;
2686 lhs = gimple_assign_lhs (next_stmt);
2687 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2690 /* Save the chain for further analysis in SLP detection. */
2691 first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2692 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2693 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2695 return true;
2698 /* Return true if we need an in-order reduction for operation CODE
2699 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2700 overflow must wrap. */
2702 static bool
2703 needs_fold_left_reduction_p (tree type, tree_code code,
2704 bool need_wrapping_integral_overflow)
2706 /* CHECKME: check for !flag_finite_math_only too? */
2707 if (SCALAR_FLOAT_TYPE_P (type))
2708 switch (code)
2710 case MIN_EXPR:
2711 case MAX_EXPR:
2712 return false;
2714 default:
2715 return !flag_associative_math;
2718 if (INTEGRAL_TYPE_P (type))
2720 if (!operation_no_trapping_overflow (type, code))
2721 return true;
2722 if (need_wrapping_integral_overflow
2723 && !TYPE_OVERFLOW_WRAPS (type)
2724 && operation_can_overflow (code))
2725 return true;
2726 return false;
2729 if (SAT_FIXED_POINT_TYPE_P (type))
2730 return true;
2732 return false;
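/* Illustrative example: a single-precision accumulation such as
     float r = 0;  for (i = 0; i < n; i++) r += x[i];
   needs a fold-left (in-order) reduction unless -fassociative-math is in
   effect, because reordering the additions may change the rounded result,
   whereas MIN_EXPR/MAX_EXPR and wrapping integer additions do not, per the
   checks above.  */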
2735 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2736 reduction operation CODE has a handled computation expression. */
2738 bool
2739 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2740 tree loop_arg, enum tree_code code)
2742 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2743 auto_bitmap visited;
2744 tree lookfor = PHI_RESULT (phi);
2745 ssa_op_iter curri;
2746 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2747 while (USE_FROM_PTR (curr) != loop_arg)
2748 curr = op_iter_next_use (&curri);
2749 curri.i = curri.numops;
2750 do
2752 path.safe_push (std::make_pair (curri, curr));
2753 tree use = USE_FROM_PTR (curr);
2754 if (use == lookfor)
2755 break;
2756 gimple *def = SSA_NAME_DEF_STMT (use);
2757 if (gimple_nop_p (def)
2758 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2760 pop:
2761 do
2763 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2764 curri = x.first;
2765 curr = x.second;
2767 curr = op_iter_next_use (&curri);
2768 /* Skip already visited or non-SSA operands (from iterating
2769 over PHI args). */
2770 while (curr != NULL_USE_OPERAND_P
2771 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2772 || ! bitmap_set_bit (visited,
2773 SSA_NAME_VERSION
2774 (USE_FROM_PTR (curr)))));
2776 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2777 if (curr == NULL_USE_OPERAND_P)
2778 break;
2780 else
2782 if (gimple_code (def) == GIMPLE_PHI)
2783 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2784 else
2785 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2786 while (curr != NULL_USE_OPERAND_P
2787 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2788 || ! bitmap_set_bit (visited,
2789 SSA_NAME_VERSION
2790 (USE_FROM_PTR (curr)))))
2791 curr = op_iter_next_use (&curri);
2792 if (curr == NULL_USE_OPERAND_P)
2793 goto pop;
2796 while (1);
2797 if (dump_file && (dump_flags & TDF_DETAILS))
2799 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2800 unsigned i;
2801 std::pair<ssa_op_iter, use_operand_p> *x;
2802 FOR_EACH_VEC_ELT (path, i, x)
2804 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2805 dump_printf (MSG_NOTE, " ");
2807 dump_printf (MSG_NOTE, "\n");
2810 /* Check whether the reduction path detected is valid. */
2811 bool fail = path.length () == 0;
2812 bool neg = false;
2813 for (unsigned i = 1; i < path.length (); ++i)
2815 gimple *use_stmt = USE_STMT (path[i].second);
2816 tree op = USE_FROM_PTR (path[i].second);
2817 if (! has_single_use (op)
2818 || ! is_gimple_assign (use_stmt))
2820 fail = true;
2821 break;
2823 if (gimple_assign_rhs_code (use_stmt) != code)
2825 if (code == PLUS_EXPR
2826 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2828 /* Track whether we negate the reduction value each iteration. */
2829 if (gimple_assign_rhs2 (use_stmt) == op)
2830 neg = ! neg;
2832 else
2834 fail = true;
2835 break;
2839 return ! fail && ! neg;
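/* Illustrative example: for the cycle
     x_1 = PHI <x_0 (preheader), x_3 (latch)>
     x_2 = x_1 + a_4;
     x_3 = x_2 + b_5;
   walking from the latch argument x_3 back to the PHI result visits two
   single-use PLUS_EXPR statements, so the path is accepted for
   code == PLUS_EXPR; a MINUS_EXPR that uses the running value as its
   second operand would toggle NEG above and, if left un-cancelled, cause
   the path to be rejected.  */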
2843 /* Function vect_is_simple_reduction
2845 (1) Detect a cross-iteration def-use cycle that represents a simple
2846 reduction computation. We look for the following pattern:
2848 loop_header:
2849 a1 = phi < a0, a2 >
2850 a3 = ...
2851 a2 = operation (a3, a1)
2855 a3 = ...
2856 loop_header:
2857 a1 = phi < a0, a2 >
2858 a2 = operation (a3, a1)
2860 such that:
2861 1. operation is commutative and associative and it is safe to
2862 change the order of the computation
2863 2. no uses for a2 in the loop (a2 is used out of the loop)
2864 3. no uses of a1 in the loop besides the reduction operation
2865 4. no uses of a1 outside the loop.
2867 Conditions 1,4 are tested here.
2868 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2870 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2871 nested cycles.
2873 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2874 reductions:
2876 a1 = phi < a0, a2 >
2877 inner loop (def of a3)
2878 a2 = phi < a3 >
2880 (4) Detect condition expressions, i.e.:
2881 for (int i = 0; i < N; i++)
2882 if (a[i] < val)
2883 ret_val = a[i];
2887 static gimple *
2888 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2889 bool *double_reduc,
2890 bool need_wrapping_integral_overflow,
2891 enum vect_reduction_type *v_reduc_type)
2893 struct loop *loop = (gimple_bb (phi))->loop_father;
2894 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2895 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2896 enum tree_code orig_code, code;
2897 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2898 tree type;
2899 int nloop_uses;
2900 tree name;
2901 imm_use_iterator imm_iter;
2902 use_operand_p use_p;
2903 bool phi_def;
2905 *double_reduc = false;
2906 *v_reduc_type = TREE_CODE_REDUCTION;
2908 tree phi_name = PHI_RESULT (phi);
2909 /* ??? If there are no uses of the PHI result the inner loop reduction
2910 won't be detected as possibly double-reduction by vectorizable_reduction
2911 because that tries to walk the PHI arg from the preheader edge which
2912 can be constant. See PR60382. */
2913 if (has_zero_uses (phi_name))
2914 return NULL;
2915 nloop_uses = 0;
2916 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2918 gimple *use_stmt = USE_STMT (use_p);
2919 if (is_gimple_debug (use_stmt))
2920 continue;
2922 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926 "intermediate value used outside loop.\n");
2928 return NULL;
2931 nloop_uses++;
2932 if (nloop_uses > 1)
2934 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 "reduction value used in loop.\n");
2937 return NULL;
2940 phi_use_stmt = use_stmt;
2943 edge latch_e = loop_latch_edge (loop);
2944 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2945 if (TREE_CODE (loop_arg) != SSA_NAME)
2947 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2950 "reduction: not ssa_name: ");
2951 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2954 return NULL;
2957 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2958 if (is_gimple_assign (def_stmt))
2960 name = gimple_assign_lhs (def_stmt);
2961 phi_def = false;
2963 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2965 name = PHI_RESULT (def_stmt);
2966 phi_def = true;
2968 else
2970 if (dump_enabled_p ())
2972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2973 "reduction: unhandled reduction operation: ");
2974 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2976 return NULL;
2979 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2980 return NULL;
2982 nloop_uses = 0;
2983 auto_vec<gphi *, 3> lcphis;
2984 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2986 gimple *use_stmt = USE_STMT (use_p);
2987 if (is_gimple_debug (use_stmt))
2988 continue;
2989 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2990 nloop_uses++;
2991 else
2992 /* We can have more than one loop-closed PHI. */
2993 lcphis.safe_push (as_a <gphi *> (use_stmt));
2994 if (nloop_uses > 1)
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "reduction used in loop.\n");
2999 return NULL;
3003 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3004 defined in the inner loop. */
3005 if (phi_def)
3007 op1 = PHI_ARG_DEF (def_stmt, 0);
3009 if (gimple_phi_num_args (def_stmt) != 1
3010 || TREE_CODE (op1) != SSA_NAME)
3012 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "unsupported phi node definition.\n");
3016 return NULL;
3019 def1 = SSA_NAME_DEF_STMT (op1);
3020 if (gimple_bb (def1)
3021 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3022 && loop->inner
3023 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3024 && is_gimple_assign (def1)
3025 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3027 if (dump_enabled_p ())
3028 report_vect_op (MSG_NOTE, def_stmt,
3029 "detected double reduction: ");
3031 *double_reduc = true;
3032 return def_stmt;
3035 return NULL;
3038 /* If we are vectorizing an inner reduction, we execute it in the
3039 original order only when we are not dealing with a double
3040 reduction. */
3041 bool check_reduction = true;
3042 if (flow_loop_nested_p (vect_loop, loop))
3044 gphi *lcphi;
3045 unsigned i;
3046 check_reduction = false;
3047 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3048 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3050 gimple *use_stmt = USE_STMT (use_p);
3051 if (is_gimple_debug (use_stmt))
3052 continue;
3053 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3054 check_reduction = true;
3058 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3059 code = orig_code = gimple_assign_rhs_code (def_stmt);
3061 /* We can handle "res -= x[i]", which is non-associative, by simply
3062 rewriting it into "res += -x[i]". Avoid changing the gimple
3063 instruction for the first simple tests and only do this if we're
3064 allowed to change the code at all. */
3065 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3066 code = PLUS_EXPR;
3068 if (code == COND_EXPR)
3070 if (! nested_in_vect_loop)
3071 *v_reduc_type = COND_REDUCTION;
3073 op3 = gimple_assign_rhs1 (def_stmt);
3074 if (COMPARISON_CLASS_P (op3))
3076 op4 = TREE_OPERAND (op3, 1);
3077 op3 = TREE_OPERAND (op3, 0);
3079 if (op3 == phi_name || op4 == phi_name)
3081 if (dump_enabled_p ())
3082 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3083 "reduction: condition depends on previous"
3084 " iteration: ");
3085 return NULL;
3088 op1 = gimple_assign_rhs2 (def_stmt);
3089 op2 = gimple_assign_rhs3 (def_stmt);
3091 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3093 if (dump_enabled_p ())
3094 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3095 "reduction: not commutative/associative: ");
3096 return NULL;
3098 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3100 op1 = gimple_assign_rhs1 (def_stmt);
3101 op2 = gimple_assign_rhs2 (def_stmt);
3103 else
3105 if (dump_enabled_p ())
3106 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3107 "reduction: not handled operation: ");
3108 return NULL;
3111 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3113 if (dump_enabled_p ())
3114 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3115 "reduction: both uses not ssa_names: ");
3117 return NULL;
3120 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3121 if ((TREE_CODE (op1) == SSA_NAME
3122 && !types_compatible_p (type, TREE_TYPE (op1)))
3123 || (TREE_CODE (op2) == SSA_NAME
3124 && !types_compatible_p (type, TREE_TYPE (op2)))
3125 || (op3 && TREE_CODE (op3) == SSA_NAME
3126 && !types_compatible_p (type, TREE_TYPE (op3)))
3127 || (op4 && TREE_CODE (op4) == SSA_NAME
3128 && !types_compatible_p (type, TREE_TYPE (op4))))
3130 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_NOTE, vect_location,
3133 "reduction: multiple types: operation type: ");
3134 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3135 dump_printf (MSG_NOTE, ", operands types: ");
3136 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3137 TREE_TYPE (op1));
3138 dump_printf (MSG_NOTE, ",");
3139 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3140 TREE_TYPE (op2));
3141 if (op3)
3143 dump_printf (MSG_NOTE, ",");
3144 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3145 TREE_TYPE (op3));
3148 if (op4)
3150 dump_printf (MSG_NOTE, ",");
3151 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3152 TREE_TYPE (op4));
3154 dump_printf (MSG_NOTE, "\n");
3157 return NULL;
3160 /* Check whether it's ok to change the order of the computation.
3161 Generally, when vectorizing a reduction we change the order of the
3162 computation. This may change the behavior of the program in some
3163 cases, so we need to check that this is ok. One exception is when
3164 vectorizing an outer-loop: the inner-loop is executed sequentially,
3165 and therefore vectorizing reductions in the inner-loop during
3166 outer-loop vectorization is safe. */
3167 if (check_reduction
3168 && *v_reduc_type == TREE_CODE_REDUCTION
3169 && needs_fold_left_reduction_p (type, code,
3170 need_wrapping_integral_overflow))
3171 *v_reduc_type = FOLD_LEFT_REDUCTION;
3173 /* Reduction is safe. We're dealing with one of the following:
3174 1) integer arithmetic and no trapv
3175 2) floating point arithmetic, and special flags permit this optimization
3176 3) nested cycle (i.e., outer loop vectorization). */
3177 if (TREE_CODE (op1) == SSA_NAME)
3178 def1 = SSA_NAME_DEF_STMT (op1);
3180 if (TREE_CODE (op2) == SSA_NAME)
3181 def2 = SSA_NAME_DEF_STMT (op2);
3183 if (code != COND_EXPR
3184 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3186 if (dump_enabled_p ())
3187 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3188 return NULL;
3191 /* Check that one def is the reduction def, defined by PHI,
3192 the other def is either defined in the loop ("vect_internal_def"),
3193 or it's an induction (defined by a loop-header phi-node). */
3195 if (def2 && def2 == phi
3196 && (code == COND_EXPR
3197 || !def1 || gimple_nop_p (def1)
3198 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3199 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3200 && (is_gimple_assign (def1)
3201 || is_gimple_call (def1)
3202 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3203 == vect_induction_def
3204 || (gimple_code (def1) == GIMPLE_PHI
3205 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3206 == vect_internal_def
3207 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3209 if (dump_enabled_p ())
3210 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3211 return def_stmt;
3214 if (def1 && def1 == phi
3215 && (code == COND_EXPR
3216 || !def2 || gimple_nop_p (def2)
3217 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3218 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3219 && (is_gimple_assign (def2)
3220 || is_gimple_call (def2)
3221 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3222 == vect_induction_def
3223 || (gimple_code (def2) == GIMPLE_PHI
3224 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3225 == vect_internal_def
3226 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3228 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3230 /* Check if we can swap operands (just for simplicity - so that
3231 the rest of the code can assume that the reduction variable
3232 is always the last (second) argument). */
3233 if (code == COND_EXPR)
3235 /* Swap cond_expr by inverting the condition. */
3236 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3237 enum tree_code invert_code = ERROR_MARK;
3238 enum tree_code cond_code = TREE_CODE (cond_expr);
3240 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3242 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3243 invert_code = invert_tree_comparison (cond_code, honor_nans);
3245 if (invert_code != ERROR_MARK)
3247 TREE_SET_CODE (cond_expr, invert_code);
3248 swap_ssa_operands (def_stmt,
3249 gimple_assign_rhs2_ptr (def_stmt),
3250 gimple_assign_rhs3_ptr (def_stmt));
3252 else
3254 if (dump_enabled_p ())
3255 report_vect_op (MSG_NOTE, def_stmt,
3256 "detected reduction: cannot swap operands "
3257 "for cond_expr");
3258 return NULL;
3261 else
3262 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3263 gimple_assign_rhs2_ptr (def_stmt));
3265 if (dump_enabled_p ())
3266 report_vect_op (MSG_NOTE, def_stmt,
3267 "detected reduction: need to swap operands: ");
3269 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3270 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3272 else
3274 if (dump_enabled_p ())
3275 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3278 return def_stmt;
3281 /* Try to find SLP reduction chain. */
3282 if (! nested_in_vect_loop
3283 && code != COND_EXPR
3284 && orig_code != MINUS_EXPR
3285 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3287 if (dump_enabled_p ())
3288 report_vect_op (MSG_NOTE, def_stmt,
3289 "reduction: detected reduction chain: ");
3291 return def_stmt;
3294 /* Dissolve the group possibly half-built by vect_is_slp_reduction. */
3295 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3296 while (first)
3298 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3299 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3300 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3301 first = next;
3304 /* Look for the expression computing loop_arg from loop PHI result. */
3305 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3306 code))
3307 return def_stmt;
3309 if (dump_enabled_p ())
3311 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3312 "reduction: unknown pattern: ");
3315 return NULL;
3318 /* Wrapper around vect_is_simple_reduction, which will modify code
3319 in-place if it enables detection of more reductions. The arguments
3320 are the same as for vect_is_simple_reduction. */
3322 gimple *
3323 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3324 bool *double_reduc,
3325 bool need_wrapping_integral_overflow)
3327 enum vect_reduction_type v_reduc_type;
3328 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3329 need_wrapping_integral_overflow,
3330 &v_reduc_type);
3331 if (def)
3333 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3334 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3335 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3336 reduc_def_info = vinfo_for_stmt (def);
3337 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3338 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3340 return def;
3343 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3344 int
3345 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3346 int *peel_iters_epilogue,
3347 stmt_vector_for_cost *scalar_cost_vec,
3348 stmt_vector_for_cost *prologue_cost_vec,
3349 stmt_vector_for_cost *epilogue_cost_vec)
3351 int retval = 0;
3352 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3354 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3356 *peel_iters_epilogue = assumed_vf / 2;
3357 if (dump_enabled_p ())
3358 dump_printf_loc (MSG_NOTE, vect_location,
3359 "cost model: epilogue peel iters set to vf/2 "
3360 "because loop iterations are unknown .\n");
3362 /* If peeled iterations are known but number of scalar loop
3363 iterations are unknown, count a taken branch per peeled loop. */
3364 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3365 NULL, 0, vect_prologue);
3366 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3367 NULL, 0, vect_epilogue);
3369 else
3371 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3372 peel_iters_prologue = niters < peel_iters_prologue ?
3373 niters : peel_iters_prologue;
3374 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3375 /* If we need to peel for gaps but no epilogue peeling would otherwise
3376 be required, we have to peel VF iterations. */
3377 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3378 *peel_iters_epilogue = assumed_vf;
3381 stmt_info_for_cost *si;
3382 int j;
3383 if (peel_iters_prologue)
3384 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3386 stmt_vec_info stmt_info
3387 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3388 retval += record_stmt_cost (prologue_cost_vec,
3389 si->count * peel_iters_prologue,
3390 si->kind, stmt_info, si->misalign,
3391 vect_prologue);
3393 if (*peel_iters_epilogue)
3394 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3396 stmt_vec_info stmt_info
3397 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3398 retval += record_stmt_cost (epilogue_cost_vec,
3399 si->count * *peel_iters_epilogue,
3400 si->kind, stmt_info, si->misalign,
3401 vect_epilogue);
3404 return retval;
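/* Illustrative example with assumed numbers: with assumed_vf == 4, a known
   iteration count of 10 and peel_iters_prologue == 2, the epilogue gets
   (10 - 2) % 4 == 0 peel iterations, bumped to 4 when peeling for gaps is
   required; the scalar cost vector is then charged 2 times into the
   prologue costs and 4 times into the epilogue costs.  */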
3407 /* Function vect_estimate_min_profitable_iters
3409 Return the number of iterations required for the vector version of the
3410 loop to be profitable relative to the cost of the scalar version of the
3411 loop.
3413 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3414 of iterations for vectorization. A value of -1 means loop vectorization
3415 is not profitable. This returned value may be used for a dynamic
3416 profitability check.
3418 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3419 for a static check against the estimated number of iterations. */
3421 static void
3422 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3423 int *ret_min_profitable_niters,
3424 int *ret_min_profitable_estimate)
3426 int min_profitable_iters;
3427 int min_profitable_estimate;
3428 int peel_iters_prologue;
3429 int peel_iters_epilogue;
3430 unsigned vec_inside_cost = 0;
3431 int vec_outside_cost = 0;
3432 unsigned vec_prologue_cost = 0;
3433 unsigned vec_epilogue_cost = 0;
3434 int scalar_single_iter_cost = 0;
3435 int scalar_outside_cost = 0;
3436 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3437 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3438 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3440 /* Cost model disabled. */
3441 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3443 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3444 *ret_min_profitable_niters = 0;
3445 *ret_min_profitable_estimate = 0;
3446 return;
3449 /* Requires loop versioning tests to handle misalignment. */
3450 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3452 /* FIXME: Make cost depend on complexity of individual check. */
3453 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3454 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3455 vect_prologue);
3456 dump_printf (MSG_NOTE,
3457 "cost model: Adding cost of checks for loop "
3458 "versioning to treat misalignment.\n");
3461 /* Requires loop versioning with alias checks. */
3462 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3464 /* FIXME: Make cost depend on complexity of individual check. */
3465 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3466 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3467 vect_prologue);
3468 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3469 if (len)
3470 /* Count LEN - 1 ANDs and LEN comparisons. */
3471 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3472 NULL, 0, vect_prologue);
3473 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3474 if (len)
3476 /* Count LEN - 1 ANDs and LEN comparisons. */
3477 unsigned int nstmts = len * 2 - 1;
3478 /* +1 for each bias that needs adding. */
3479 for (unsigned int i = 0; i < len; ++i)
3480 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3481 nstmts += 1;
3482 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3483 NULL, 0, vect_prologue);
3485 dump_printf (MSG_NOTE,
3486 "cost model: Adding cost of checks for loop "
3487 "versioning aliasing.\n");
3490 /* Requires loop versioning with niter checks. */
3491 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3493 /* FIXME: Make cost depend on complexity of individual check. */
3494 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3495 vect_prologue);
3496 dump_printf (MSG_NOTE,
3497 "cost model: Adding cost of checks for loop "
3498 "versioning niters.\n");
3501 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3502 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3503 vect_prologue);
3505 /* Count statements in scalar loop. Using this as scalar cost for a single
3506 iteration for now.
3508 TODO: Add outer loop support.
3510 TODO: Consider assigning different costs to different scalar
3511 statements. */
3513 scalar_single_iter_cost
3514 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3516 /* Add additional cost for the peeled instructions in prologue and epilogue
3517 loop. (For fully-masked loops there will be no peeling.)
3519 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3520 at compile time, we assume it's vf/2 (the worst would be vf-1).
3522 TODO: Build an expression that represents peel_iters for prologue and
3523 epilogue to be used in a run-time test. */
3525 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3527 peel_iters_prologue = 0;
3528 peel_iters_epilogue = 0;
3530 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3532 /* We need to peel exactly one iteration. */
3533 peel_iters_epilogue += 1;
3534 stmt_info_for_cost *si;
3535 int j;
3536 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3537 j, si)
3539 struct _stmt_vec_info *stmt_info
3540 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3541 (void) add_stmt_cost (target_cost_data, si->count,
3542 si->kind, stmt_info, si->misalign,
3543 vect_epilogue);
3547 else if (npeel < 0)
3549 peel_iters_prologue = assumed_vf / 2;
3550 dump_printf (MSG_NOTE, "cost model: "
3551 "prologue peel iters set to vf/2.\n");
3553 /* If peeling for alignment is unknown, the loop bound of the main loop
3554 becomes unknown. */
3555 peel_iters_epilogue = assumed_vf / 2;
3556 dump_printf (MSG_NOTE, "cost model: "
3557 "epilogue peel iters set to vf/2 because "
3558 "peeling for alignment is unknown.\n");
3560 /* If peeled iterations are unknown, count a taken branch and a not taken
3561 branch per peeled loop. Even if scalar loop iterations are known,
3562 vector iterations are not known since peeled prologue iterations are
3563 not known. Hence guards remain the same. */
3564 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3565 NULL, 0, vect_prologue);
3566 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3567 NULL, 0, vect_prologue);
3568 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3569 NULL, 0, vect_epilogue);
3570 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3571 NULL, 0, vect_epilogue);
3572 stmt_info_for_cost *si;
3573 int j;
3574 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3576 struct _stmt_vec_info *stmt_info
3577 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3578 (void) add_stmt_cost (target_cost_data,
3579 si->count * peel_iters_prologue,
3580 si->kind, stmt_info, si->misalign,
3581 vect_prologue);
3582 (void) add_stmt_cost (target_cost_data,
3583 si->count * peel_iters_epilogue,
3584 si->kind, stmt_info, si->misalign,
3585 vect_epilogue);
3588 else
3590 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3591 stmt_info_for_cost *si;
3592 int j;
3593 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3595 prologue_cost_vec.create (2);
3596 epilogue_cost_vec.create (2);
3597 peel_iters_prologue = npeel;
3599 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3600 &peel_iters_epilogue,
3601 &LOOP_VINFO_SCALAR_ITERATION_COST
3602 (loop_vinfo),
3603 &prologue_cost_vec,
3604 &epilogue_cost_vec);
3606 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3608 struct _stmt_vec_info *stmt_info
3609 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3610 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3611 si->misalign, vect_prologue);
3614 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3616 struct _stmt_vec_info *stmt_info
3617 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3618 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3619 si->misalign, vect_epilogue);
3622 prologue_cost_vec.release ();
3623 epilogue_cost_vec.release ();
3626 /* FORNOW: The scalar outside cost is incremented in one of the
3627 following ways:
3629 1. The vectorizer checks for alignment and aliasing and generates
3630 a condition that allows dynamic vectorization. A cost model
3631 check is ANDED with the versioning condition. Hence scalar code
3632 path now has the added cost of the versioning check.
3634 if (cost > th & versioning_check)
3635 jmp to vector code
3637 Hence run-time scalar is incremented by not-taken branch cost.
3639 2. The vectorizer then checks if a prologue is required. If the
3640 cost model check was not done before during versioning, it has to
3641 be done before the prologue check.
3643 if (cost <= th)
3644 prologue = scalar_iters
3645 if (prologue == 0)
3646 jmp to vector code
3647 else
3648 execute prologue
3649 if (prologue == num_iters)
3650 go to exit
3652 Hence the run-time scalar cost is incremented by a taken branch,
3653 plus a not-taken branch, plus a taken branch cost.
3655 3. The vectorizer then checks if an epilogue is required. If the
3656 cost model check was not done before during prologue check, it
3657 has to be done with the epilogue check.
3659 if (prologue == 0)
3660 jmp to vector code
3661 else
3662 execute prologue
3663 if (prologue == num_iters)
3664 go to exit
3665 vector code:
3666 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3667 jmp to epilogue
3669 Hence the run-time scalar cost should be incremented by 2 taken
3670 branches.
3672 TODO: The back end may reorder the BBS's differently and reverse
3673 conditions/branch directions. Change the estimates below to
3674 something more reasonable. */
3676 /* If the number of iterations is known and we do not do versioning, we can
3677 decide whether to vectorize at compile time. Hence the scalar version
3678 does not carry cost model guard costs. */
3679 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3680 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3682 /* Cost model check occurs at versioning. */
3683 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3684 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3685 else
3687 /* Cost model check occurs at prologue generation. */
3688 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3689 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3690 + vect_get_stmt_cost (cond_branch_not_taken);
3691 /* Cost model check occurs at epilogue generation. */
3692 else
3693 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
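/* Illustrative example with assumed branch costs cond_branch_taken == 3
   and cond_branch_not_taken == 1: a versioned loop adds 1 to the scalar
   outside cost (case 1 above), an unversioned loop with unknown prologue
   peeling adds 2 * 3 + 1 = 7 (case 2), and otherwise 2 * 3 = 6 is added
   (case 3).  */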
3697 /* Complete the target-specific cost calculations. */
3698 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3699 &vec_inside_cost, &vec_epilogue_cost);
3701 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3703 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3706 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3707 vec_inside_cost);
3708 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3709 vec_prologue_cost);
3710 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3711 vec_epilogue_cost);
3712 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3713 scalar_single_iter_cost);
3714 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3715 scalar_outside_cost);
3716 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3717 vec_outside_cost);
3718 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3719 peel_iters_prologue);
3720 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3721 peel_iters_epilogue);
3724 /* Calculate number of iterations required to make the vector version
3725 profitable, relative to the loop bodies only. The following condition
3726 must hold true:
3727 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3728 where
3729 SIC = scalar iteration cost, VIC = vector iteration cost,
3730 VOC = vector outside cost, VF = vectorization factor,
3731 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3732 SOC = scalar outside cost for run time cost model check. */
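/* Worked example with invented numbers (not from any target): take
   SIC == 4, VIC == 8, VOC == 60, SOC == 6, VF == 4 and no peeling
   (PL_ITERS == EP_ITERS == 0).  The code below computes
     (60 - 6) * 4 / (4 * 4 - 8) == 216 / 8 == 27,
   and since 4 * 4 * 27 == 432 == 8 * 27 + (60 - 6) * 4 the scalar and
   vector costs only break even at 27 iterations, so the result is
   bumped to 28.  */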
3734 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3736 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3737 * assumed_vf
3738 - vec_inside_cost * peel_iters_prologue
3739 - vec_inside_cost * peel_iters_epilogue);
3740 if (min_profitable_iters <= 0)
3741 min_profitable_iters = 0;
3742 else
3744 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3745 - vec_inside_cost);
3747 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3748 <= (((int) vec_inside_cost * min_profitable_iters)
3749 + (((int) vec_outside_cost - scalar_outside_cost)
3750 * assumed_vf)))
3751 min_profitable_iters++;
3754 /* The vector version will never be profitable. */
3755 else
3757 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3758 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3759 "vectorization did not happen for a simd loop");
3761 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3763 "cost model: the vector iteration cost = %d "
3764 "divided by the scalar iteration cost = %d "
3765 "is greater or equal to the vectorization factor = %d"
3766 ".\n",
3767 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3768 *ret_min_profitable_niters = -1;
3769 *ret_min_profitable_estimate = -1;
3770 return;
3773 dump_printf (MSG_NOTE,
3774 " Calculated minimum iters for profitability: %d\n",
3775 min_profitable_iters);
3777 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3778 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3779 /* We want the vectorized loop to execute at least once. */
3780 min_profitable_iters = assumed_vf + peel_iters_prologue;
3782 if (dump_enabled_p ())
3783 dump_printf_loc (MSG_NOTE, vect_location,
3784 " Runtime profitability threshold = %d\n",
3785 min_profitable_iters);
3787 *ret_min_profitable_niters = min_profitable_iters;
3789 /* Calculate number of iterations required to make the vector version
3790 profitable, relative to the loop bodies only.
3792 The non-vectorized variant costs SIC * niters and it must win over the
3793 vector variant on the expected loop trip count. The following condition must hold true:
3794 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
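/* Continuing the invented numbers above: because the runtime cost
   model check is executed on this path anyway, SOC is added to the
   vector side instead of being subtracted, giving
     (60 + 6) * 4 / (4 * 4 - 8) == 264 / 8 == 33,
   which is then clamped below to at least MIN_PROFITABLE_ITERS.  */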
3796 if (vec_outside_cost <= 0)
3797 min_profitable_estimate = 0;
3798 else
3800 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3801 * assumed_vf
3802 - vec_inside_cost * peel_iters_prologue
3803 - vec_inside_cost * peel_iters_epilogue)
3804 / ((scalar_single_iter_cost * assumed_vf)
3805 - vec_inside_cost);
3807 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3808 if (dump_enabled_p ())
3809 dump_printf_loc (MSG_NOTE, vect_location,
3810 " Static estimate profitability threshold = %d\n",
3811 min_profitable_estimate);
3813 *ret_min_profitable_estimate = min_profitable_estimate;
3816 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3817 vector elements (not bits) for a vector with NELT elements. */
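/* For example (illustration only): with OFFSET == 2 and NELT == 8 the
   stepped encoding {2, 3, 4} built below expands to the selector
   {2, 3, 4, 5, 6, 7, 8, 9}.  Indices 8 and 9 pick elements from the
   second vec_perm operand, so with a zero second operand (as used by
   the shift-reduction epilogue) the permute shifts the first operand
   towards element 0 by two elements.  */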
3818 static void
3819 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3820 vec_perm_builder *sel)
3822 /* The encoding is a single stepped pattern. Any wrap-around is handled
3823 by vec_perm_indices. */
3824 sel->new_vector (nelt, 1, 3);
3825 for (unsigned int i = 0; i < 3; i++)
3826 sel->quick_push (i + offset);
3829 /* Checks whether the target supports whole-vector shifts for vectors of mode
3830 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3831 it supports vec_perm_const with masks for all necessary shift amounts. */
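/* For instance (illustration only): for an 8-element vector mode
   without vec_shr support this checks the permute masks for shift
   amounts 4, 2 and 1, i.e. exactly the offsets that the log2
   shift-reduction sequence in the reduction epilogue needs.  */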
3832 static bool
3833 have_whole_vector_shift (machine_mode mode)
3835 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3836 return true;
3838 /* Variable-length vectors should be handled via the optab. */
3839 unsigned int nelt;
3840 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3841 return false;
3843 vec_perm_builder sel;
3844 vec_perm_indices indices;
3845 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3847 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3848 indices.new_vector (sel, 2, nelt);
3849 if (!can_vec_perm_const_p (mode, indices, false))
3850 return false;
3852 return true;
3855 /* TODO: There is a close dependency between vect_model_*_cost and the
3856 vectorizable_* functions. Design better to avoid maintenance issues. */
3858 /* Function vect_model_reduction_cost.
3860 Models cost for a reduction operation, including the vector ops
3861 generated within the strip-mine loop, the initial definition before
3862 the loop, and the epilogue code that must be generated. */
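/* Rough illustration (not taken from any target): for a COND_REDUCTION
   with NCOPIES == 1 and a direct REDUC_FN available, the code below
   records 4 scalar_to_vec stmts in the prologue (initial index, step,
   initial data value, initial index value), 2 vector_stmts in the body
   (NCOPIES is doubled for condition reductions), and in the epilogue
   2 vector_stmts, 2 vec_to_scalar reductions and 1 scalar_to_vec
   broadcast.  */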
3864 static void
3865 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3866 int ncopies, stmt_vector_for_cost *cost_vec)
3868 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3869 enum tree_code code;
3870 optab optab;
3871 tree vectype;
3872 gimple *orig_stmt;
3873 machine_mode mode;
3874 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3875 struct loop *loop = NULL;
3877 if (loop_vinfo)
3878 loop = LOOP_VINFO_LOOP (loop_vinfo);
3880 /* Condition reductions generate two reductions in the loop. */
3881 vect_reduction_type reduction_type
3882 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3883 if (reduction_type == COND_REDUCTION)
3884 ncopies *= 2;
3886 vectype = STMT_VINFO_VECTYPE (stmt_info);
3887 mode = TYPE_MODE (vectype);
3888 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3890 if (!orig_stmt)
3891 orig_stmt = STMT_VINFO_STMT (stmt_info);
3893 code = gimple_assign_rhs_code (orig_stmt);
3895 if (reduction_type == EXTRACT_LAST_REDUCTION
3896 || reduction_type == FOLD_LEFT_REDUCTION)
3898 /* No extra instructions needed in the prologue. */
3899 prologue_cost = 0;
3901 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3902 /* Count one reduction-like operation per vector. */
3903 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3904 stmt_info, 0, vect_body);
3905 else
3907 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3908 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3909 inside_cost = record_stmt_cost (cost_vec, nelements,
3910 vec_to_scalar, stmt_info, 0,
3911 vect_body);
3912 inside_cost += record_stmt_cost (cost_vec, nelements,
3913 scalar_stmt, stmt_info, 0,
3914 vect_body);
3917 else
3919 /* Add in cost for initial definition.
3920 For cond reduction we have four vectors: initial index, step,
3921 initial result of the data reduction, initial value of the index
3922 reduction. */
3923 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3924 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3925 scalar_to_vec, stmt_info, 0,
3926 vect_prologue);
3928 /* Cost of reduction op inside loop. */
3929 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3930 stmt_info, 0, vect_body);
3933 /* Determine cost of epilogue code.
3935 We have a reduction operator that will reduce the vector in one statement.
3936 Also requires scalar extract. */
3938 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3940 if (reduc_fn != IFN_LAST)
3942 if (reduction_type == COND_REDUCTION)
3944 /* An EQ stmt and a COND_EXPR stmt. */
3945 epilogue_cost += record_stmt_cost (cost_vec, 2,
3946 vector_stmt, stmt_info, 0,
3947 vect_epilogue);
3948 /* Reduction of the max index and a reduction of the found
3949 values. */
3950 epilogue_cost += record_stmt_cost (cost_vec, 2,
3951 vec_to_scalar, stmt_info, 0,
3952 vect_epilogue);
3953 /* A broadcast of the max value. */
3954 epilogue_cost += record_stmt_cost (cost_vec, 1,
3955 scalar_to_vec, stmt_info, 0,
3956 vect_epilogue);
3958 else
3960 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3961 stmt_info, 0, vect_epilogue);
3962 epilogue_cost += record_stmt_cost (cost_vec, 1,
3963 vec_to_scalar, stmt_info, 0,
3964 vect_epilogue);
3967 else if (reduction_type == COND_REDUCTION)
3969 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3970 /* Extraction of scalar elements. */
3971 epilogue_cost += record_stmt_cost (cost_vec,
3972 2 * estimated_nunits,
3973 vec_to_scalar, stmt_info, 0,
3974 vect_epilogue);
3975 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3976 epilogue_cost += record_stmt_cost (cost_vec,
3977 2 * estimated_nunits - 3,
3978 scalar_stmt, stmt_info, 0,
3979 vect_epilogue);
3981 else if (reduction_type == EXTRACT_LAST_REDUCTION
3982 || reduction_type == FOLD_LEFT_REDUCTION)
3983 /* No extra instructions needed in the epilogue. */
3985 else
3987 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3988 tree bitsize =
3989 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3990 int element_bitsize = tree_to_uhwi (bitsize);
3991 int nelements = vec_size_in_bits / element_bitsize;
3993 if (code == COND_EXPR)
3994 code = MAX_EXPR;
3996 optab = optab_for_tree_code (code, vectype, optab_default);
3998 /* We have a whole vector shift available. */
3999 if (optab != unknown_optab
4000 && VECTOR_MODE_P (mode)
4001 && optab_handler (optab, mode) != CODE_FOR_nothing
4002 && have_whole_vector_shift (mode))
4004 /* Final reduction via vector shifts and the reduction operator.
4005 Also requires scalar extract. */
4006 epilogue_cost += record_stmt_cost (cost_vec,
4007 exact_log2 (nelements) * 2,
4008 vector_stmt, stmt_info, 0,
4009 vect_epilogue);
4010 epilogue_cost += record_stmt_cost (cost_vec, 1,
4011 vec_to_scalar, stmt_info, 0,
4012 vect_epilogue);
4014 else
4015 /* Use extracts and reduction op for final reduction. For N
4016 elements, we have N extracts and N-1 reduction ops. */
4017 epilogue_cost += record_stmt_cost (cost_vec,
4018 nelements + nelements - 1,
4019 vector_stmt, stmt_info, 0,
4020 vect_epilogue);
4024 if (dump_enabled_p ())
4025 dump_printf (MSG_NOTE,
4026 "vect_model_reduction_cost: inside_cost = %d, "
4027 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4028 prologue_cost, epilogue_cost);
4032 /* Function vect_model_induction_cost.
4034 Models cost for induction operations. */
4036 static void
4037 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4038 stmt_vector_for_cost *cost_vec)
4040 unsigned inside_cost, prologue_cost;
4042 if (PURE_SLP_STMT (stmt_info))
4043 return;
4045 /* loop cost for vec_loop. */
4046 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4047 stmt_info, 0, vect_body);
4049 /* prologue cost for vec_init and vec_step. */
4050 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4051 stmt_info, 0, vect_prologue);
4053 if (dump_enabled_p ())
4054 dump_printf_loc (MSG_NOTE, vect_location,
4055 "vect_model_induction_cost: inside_cost = %d, "
4056 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4061 /* Function get_initial_def_for_reduction
4063 Input:
4064 STMT - a stmt that performs a reduction operation in the loop.
4065 INIT_VAL - the initial value of the reduction variable
4067 Output:
4068 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4069 of the reduction (used for adjusting the epilog - see below).
4070 Return a vector variable, initialized according to the operation that STMT
4071 performs. This vector will be used as the initial value of the
4072 vector of partial results.
4074 Option1 (adjust in epilog): Initialize the vector as follows:
4075 add/bit or/xor: [0,0,...,0,0]
4076 mult/bit and: [1,1,...,1,1]
4077 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4078 and when necessary (e.g. add/mult case) let the caller know
4079 that it needs to adjust the result by init_val.
4081 Option2: Initialize the vector as follows:
4082 add/bit or/xor: [init_val,0,0,...,0]
4083 mult/bit and: [init_val,1,1,...,1]
4084 min/max/cond_expr: [init_val,init_val,...,init_val]
4085 and no adjustments are needed.
4087 For example, for the following code:
4089 s = init_val;
4090 for (i=0;i<n;i++)
4091 s = s + a[i];
4093 STMT is 's = s + a[i]', and the reduction variable is 's'.
4094 For a vector of 4 units, we want to return either [0,0,0,init_val],
4095 or [0,0,0,0] and let the caller know that it needs to adjust
4096 the result at the end by 'init_val'.
4098 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4099 is not NULL, because this way the initialization vector is simpler (the
4100 same element in all entries), and Option2 otherwise.
4102 A cost model should help decide between these two schemes. */
4104 tree
4105 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4106 tree *adjustment_def)
4108 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4109 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4110 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4111 tree scalar_type = TREE_TYPE (init_val);
4112 tree vectype = get_vectype_for_scalar_type (scalar_type);
4113 enum tree_code code = gimple_assign_rhs_code (stmt);
4114 tree def_for_init;
4115 tree init_def;
4116 bool nested_in_vect_loop = false;
4117 REAL_VALUE_TYPE real_init_val = dconst0;
4118 int int_init_val = 0;
4119 gimple *def_stmt = NULL;
4120 gimple_seq stmts = NULL;
4122 gcc_assert (vectype);
4124 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4125 || SCALAR_FLOAT_TYPE_P (scalar_type));
4127 if (nested_in_vect_loop_p (loop, stmt))
4128 nested_in_vect_loop = true;
4129 else
4130 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4132 /* In case of double reduction we only create a vector variable to be put
4133 in the reduction phi node. The actual statement creation is done in
4134 vect_create_epilog_for_reduction. */
4135 if (adjustment_def && nested_in_vect_loop
4136 && TREE_CODE (init_val) == SSA_NAME
4137 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4138 && gimple_code (def_stmt) == GIMPLE_PHI
4139 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4140 && vinfo_for_stmt (def_stmt)
4141 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4142 == vect_double_reduction_def)
4144 *adjustment_def = NULL;
4145 return vect_create_destination_var (init_val, vectype);
4148 vect_reduction_type reduction_type
4149 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4151 /* In case of a nested reduction do not use an adjustment def, as
4152 that case is not handled correctly by the epilogue generation
4153 if ncopies is not one. */
4154 if (adjustment_def && nested_in_vect_loop)
4156 *adjustment_def = NULL;
4157 return vect_get_vec_def_for_operand (init_val, stmt);
4160 switch (code)
4162 case WIDEN_SUM_EXPR:
4163 case DOT_PROD_EXPR:
4164 case SAD_EXPR:
4165 case PLUS_EXPR:
4166 case MINUS_EXPR:
4167 case BIT_IOR_EXPR:
4168 case BIT_XOR_EXPR:
4169 case MULT_EXPR:
4170 case BIT_AND_EXPR:
4172 /* ADJUSTMENT_DEF is NULL when called from
4173 vect_create_epilog_for_reduction to vectorize double reduction. */
4174 if (adjustment_def)
4175 *adjustment_def = init_val;
4177 if (code == MULT_EXPR)
4179 real_init_val = dconst1;
4180 int_init_val = 1;
4183 if (code == BIT_AND_EXPR)
4184 int_init_val = -1;
4186 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4187 def_for_init = build_real (scalar_type, real_init_val);
4188 else
4189 def_for_init = build_int_cst (scalar_type, int_init_val);
4191 if (adjustment_def)
4192 /* Option1: the first element is '0' or '1' as well. */
4193 init_def = gimple_build_vector_from_val (&stmts, vectype,
4194 def_for_init);
4195 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4197 /* Option2 (variable length): the first element is INIT_VAL. */
4198 init_def = gimple_build_vector_from_val (&stmts, vectype,
4199 def_for_init);
4200 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4201 vectype, init_def, init_val);
4203 else
4205 /* Option2: the first element is INIT_VAL. */
4206 tree_vector_builder elts (vectype, 1, 2);
4207 elts.quick_push (init_val);
4208 elts.quick_push (def_for_init);
4209 init_def = gimple_build_vector (&stmts, &elts);
4212 break;
4214 case MIN_EXPR:
4215 case MAX_EXPR:
4216 case COND_EXPR:
4218 if (adjustment_def)
4220 *adjustment_def = NULL_TREE;
4221 if (reduction_type != COND_REDUCTION
4222 && reduction_type != EXTRACT_LAST_REDUCTION)
4224 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4225 break;
4228 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4229 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4231 break;
4233 default:
4234 gcc_unreachable ();
4237 if (stmts)
4238 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4239 return init_def;
4242 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4243 NUMBER_OF_VECTORS is the number of vector defs to create.
4244 If NEUTRAL_OP is nonnull, introducing extra elements of that
4245 value will not change the result. */
4247 static void
4248 get_initial_defs_for_reduction (slp_tree slp_node,
4249 vec<tree> *vec_oprnds,
4250 unsigned int number_of_vectors,
4251 bool reduc_chain, tree neutral_op)
4253 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4254 gimple *stmt = stmts[0];
4255 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4256 unsigned HOST_WIDE_INT nunits;
4257 unsigned j, number_of_places_left_in_vector;
4258 tree vector_type;
4259 tree vop;
4260 int group_size = stmts.length ();
4261 unsigned int vec_num, i;
4262 unsigned number_of_copies = 1;
4263 vec<tree> voprnds;
4264 voprnds.create (number_of_vectors);
4265 struct loop *loop;
4266 auto_vec<tree, 16> permute_results;
4268 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4270 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4272 loop = (gimple_bb (stmt))->loop_father;
4273 gcc_assert (loop);
4274 edge pe = loop_preheader_edge (loop);
4276 gcc_assert (!reduc_chain || neutral_op);
4278 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4279 created vectors. It is greater than 1 if unrolling is performed.
4281 For example, we have two scalar operands, s1 and s2 (e.g., group of
4282 strided accesses of size two), while NUNITS is four (i.e., four scalars
4283 of this type can be packed in a vector). The output vector will contain
4284 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4285 will be 2).
4287 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4288 vectors containing the operands.
4290 For example, NUNITS is four as before, and the group size is 8
4291 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4292 {s5, s6, s7, s8}. */
4294 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4295 nunits = group_size;
4297 number_of_copies = nunits * number_of_vectors / group_size;
4299 number_of_places_left_in_vector = nunits;
4300 bool constant_p = true;
4301 tree_vector_builder elts (vector_type, nunits, 1);
4302 elts.quick_grow (nunits);
4303 for (j = 0; j < number_of_copies; j++)
4305 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4307 tree op;
4308 /* Get the def before the loop. In a reduction chain we have only
4309 one initial value. */
4310 if ((j != (number_of_copies - 1)
4311 || (reduc_chain && i != 0))
4312 && neutral_op)
4313 op = neutral_op;
4314 else
4315 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4317 /* Create 'vect_ = {op0,op1,...,opn}'. */
4318 number_of_places_left_in_vector--;
4319 elts[number_of_places_left_in_vector] = op;
4320 if (!CONSTANT_CLASS_P (op))
4321 constant_p = false;
4323 if (number_of_places_left_in_vector == 0)
4325 gimple_seq ctor_seq = NULL;
4326 tree init;
4327 if (constant_p && !neutral_op
4328 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4329 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4330 /* Build the vector directly from ELTS. */
4331 init = gimple_build_vector (&ctor_seq, &elts);
4332 else if (neutral_op)
4334 /* Build a vector of the neutral value and shift the
4335 other elements into place. */
4336 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4337 neutral_op);
4338 int k = nunits;
4339 while (k > 0 && elts[k - 1] == neutral_op)
4340 k -= 1;
4341 while (k > 0)
4343 k -= 1;
4344 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4345 vector_type, init, elts[k]);
4348 else
4350 /* First time round, duplicate ELTS to fill the
4351 required number of vectors, then cherry pick the
4352 appropriate result for each iteration. */
4353 if (vec_oprnds->is_empty ())
4354 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4355 number_of_vectors,
4356 permute_results);
4357 init = permute_results[number_of_vectors - j - 1];
4359 if (ctor_seq != NULL)
4360 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4361 voprnds.quick_push (init);
4363 number_of_places_left_in_vector = nunits;
4364 elts.new_vector (vector_type, nunits, 1);
4365 elts.quick_grow (nunits);
4366 constant_p = true;
4371 /* Since the vectors are created in the reverse order, we should invert
4372 them. */
4373 vec_num = voprnds.length ();
4374 for (j = vec_num; j != 0; j--)
4376 vop = voprnds[j - 1];
4377 vec_oprnds->quick_push (vop);
4380 voprnds.release ();
4382 /* If VF is greater than the unrolling factor needed for the SLP
4383 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4384 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4385 to replicate the vectors. */
4386 tree neutral_vec = NULL;
4387 while (number_of_vectors > vec_oprnds->length ())
4389 if (neutral_op)
4391 if (!neutral_vec)
4393 gimple_seq ctor_seq = NULL;
4394 neutral_vec = gimple_build_vector_from_val
4395 (&ctor_seq, vector_type, neutral_op);
4396 if (ctor_seq != NULL)
4397 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4399 vec_oprnds->quick_push (neutral_vec);
4401 else
4403 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4404 vec_oprnds->quick_push (vop);
4410 /* Function vect_create_epilog_for_reduction
4412 Create code at the loop-epilog to finalize the result of a reduction
4413 computation.
4415 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4416 reduction statements.
4417 STMT is the scalar reduction stmt that is being vectorized.
4418 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4419 number of elements that we can fit in a vectype (nunits). In this case
4420 we have to generate more than one vector stmt, i.e., we need to "unroll"
4421 the vector stmt by a factor VF/nunits. For more details see documentation
4422 in vectorizable_operation.
4423 REDUC_FN is the internal function for the epilog reduction.
4424 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4425 computation.
4426 REDUC_INDEX is the index of the operand in the right hand side of the
4427 statement that is defined by REDUCTION_PHI.
4428 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4429 SLP_NODE is an SLP node containing a group of reduction statements. The
4430 first one in this group is STMT.
4431 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4432 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4433 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4434 any value of the IV in the loop.
4435 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4436 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4437 null if this is not an SLP reduction
4439 This function:
4440 1. Creates the reduction def-use cycles: sets the arguments for
4441 REDUCTION_PHIS:
4442 The loop-entry argument is the vectorized initial-value of the reduction.
4443 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4444 sums.
4445 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4446 by calling the function specified by REDUC_FN if available, or by
4447 other means (whole-vector shifts or a scalar loop).
4448 The function also creates a new phi node at the loop exit to preserve
4449 loop-closed form, as illustrated below.
4451 The flow at the entry to this function:
4453 loop:
4454 vec_def = phi <null, null> # REDUCTION_PHI
4455 VECT_DEF = vector_stmt # vectorized form of STMT
4456 s_loop = scalar_stmt # (scalar) STMT
4457 loop_exit:
4458 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4459 use <s_out0>
4460 use <s_out0>
4462 The above is transformed by this function into:
4464 loop:
4465 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4466 VECT_DEF = vector_stmt # vectorized form of STMT
4467 s_loop = scalar_stmt # (scalar) STMT
4468 loop_exit:
4469 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4470 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4471 v_out2 = reduce <v_out1>
4472 s_out3 = extract_field <v_out2, 0>
4473 s_out4 = adjust_result <s_out3>
4474 use <s_out4>
4475 use <s_out4> */
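/* Illustration only: for a simple sum reduction with a 4-wide vector
   type, if the vector of partial sums VECT_DEF ends up as
   {10, 20, 30, 40}, then v_out2 = reduce <v_out1> yields 100,
   s_out3 extracts that scalar, and s_out4 applies ADJUSTMENT_DEF if
   one was recorded (e.g. adding back the scalar initial value for a
   PLUS reduction initialized from the all-zeros vector).  */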
4478 static void
4479 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4480 gimple *reduc_def_stmt,
4481 int ncopies, internal_fn reduc_fn,
4482 vec<gimple *> reduction_phis,
4483 bool double_reduc,
4484 slp_tree slp_node,
4485 slp_instance slp_node_instance,
4486 tree induc_val, enum tree_code induc_code,
4487 tree neutral_op)
4489 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4490 stmt_vec_info prev_phi_info;
4491 tree vectype;
4492 machine_mode mode;
4493 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4494 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4495 basic_block exit_bb;
4496 tree scalar_dest;
4497 tree scalar_type;
4498 gimple *new_phi = NULL, *phi;
4499 gimple_stmt_iterator exit_gsi;
4500 tree vec_dest;
4501 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4502 gimple *epilog_stmt = NULL;
4503 enum tree_code code = gimple_assign_rhs_code (stmt);
4504 gimple *exit_phi;
4505 tree bitsize;
4506 tree adjustment_def = NULL;
4507 tree vec_initial_def = NULL;
4508 tree expr, def, initial_def = NULL;
4509 tree orig_name, scalar_result;
4510 imm_use_iterator imm_iter, phi_imm_iter;
4511 use_operand_p use_p, phi_use_p;
4512 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4513 bool nested_in_vect_loop = false;
4514 auto_vec<gimple *> new_phis;
4515 auto_vec<gimple *> inner_phis;
4516 enum vect_def_type dt = vect_unknown_def_type;
4517 int j, i;
4518 auto_vec<tree> scalar_results;
4519 unsigned int group_size = 1, k, ratio;
4520 auto_vec<tree> vec_initial_defs;
4521 auto_vec<gimple *> phis;
4522 bool slp_reduc = false;
4523 bool direct_slp_reduc;
4524 tree new_phi_result;
4525 gimple *inner_phi = NULL;
4526 tree induction_index = NULL_TREE;
4528 if (slp_node)
4529 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4531 if (nested_in_vect_loop_p (loop, stmt))
4533 outer_loop = loop;
4534 loop = loop->inner;
4535 nested_in_vect_loop = true;
4536 gcc_assert (!slp_node);
4539 vectype = STMT_VINFO_VECTYPE (stmt_info);
4540 gcc_assert (vectype);
4541 mode = TYPE_MODE (vectype);
4543 /* 1. Create the reduction def-use cycle:
4544 Set the arguments of REDUCTION_PHIS, i.e., transform
4546 loop:
4547 vec_def = phi <null, null> # REDUCTION_PHI
4548 VECT_DEF = vector_stmt # vectorized form of STMT
4551 into:
4553 loop:
4554 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4555 VECT_DEF = vector_stmt # vectorized form of STMT
4558 (in case of SLP, do it for all the phis). */
4560 /* Get the loop-entry arguments. */
4561 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4562 if (slp_node)
4564 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4565 vec_initial_defs.reserve (vec_num);
4566 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4567 &vec_initial_defs, vec_num,
4568 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4569 neutral_op);
4571 else
4573 /* Get at the scalar def before the loop, that defines the initial value
4574 of the reduction variable. */
4575 gimple *def_stmt;
4576 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4577 loop_preheader_edge (loop));
4578 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4579 and we can't use zero for induc_val, use initial_def. Similarly
4580 for REDUC_MIN and initial_def larger than the base. */
4581 if (TREE_CODE (initial_def) == INTEGER_CST
4582 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4583 == INTEGER_INDUC_COND_REDUCTION)
4584 && !integer_zerop (induc_val)
4585 && ((induc_code == MAX_EXPR
4586 && tree_int_cst_lt (initial_def, induc_val))
4587 || (induc_code == MIN_EXPR
4588 && tree_int_cst_lt (induc_val, initial_def))))
4589 induc_val = initial_def;
4590 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4591 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4592 &adjustment_def);
4593 vec_initial_defs.create (1);
4594 vec_initial_defs.quick_push (vec_initial_def);
4597 /* Set phi nodes arguments. */
4598 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4600 tree vec_init_def = vec_initial_defs[i];
4601 tree def = vect_defs[i];
4602 for (j = 0; j < ncopies; j++)
4604 if (j != 0)
4606 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4607 if (nested_in_vect_loop)
4608 vec_init_def
4609 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4610 vec_init_def);
4613 /* Set the loop-entry arg of the reduction-phi. */
4615 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4616 == INTEGER_INDUC_COND_REDUCTION)
4618 /* Initialise the reduction phi to zero. This prevents non-zero
4619 initial values from interfering with the reduction op. */
4620 gcc_assert (ncopies == 1);
4621 gcc_assert (i == 0);
4623 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4624 tree induc_val_vec
4625 = build_vector_from_val (vec_init_def_type, induc_val);
4627 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4628 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4630 else
4631 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4632 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4634 /* Set the loop-latch arg for the reduction-phi. */
4635 if (j > 0)
4636 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4638 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4639 UNKNOWN_LOCATION);
4641 if (dump_enabled_p ())
4643 dump_printf_loc (MSG_NOTE, vect_location,
4644 "transform reduction: created def-use cycle: ");
4645 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4646 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4651 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4652 which is updated with the current index of the loop for every match of
4653 the original loop's cond_expr (VEC_STMT). This results in a vector
4654 containing the last time the condition passed for that vector lane.
4655 The first match will be a 1 to allow 0 to be used for non-matching
4656 indexes. If there are no matches at all then the vector will be all
4657 zeroes. */
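/* Illustration (values invented): for a 4-lane vector the index IV
   below takes the values {1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12},
   ... in successive iterations.  If lane 1 last matched in the first
   iteration and lane 3 last matched in the third, INDUCTION_INDEX ends
   up as {0, 2, 0, 12}; lanes that never matched stay 0.  */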
4658 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4660 tree indx_before_incr, indx_after_incr;
4661 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4663 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4664 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4666 int scalar_precision
4667 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4668 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4669 tree cr_index_vector_type = build_vector_type
4670 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4672 /* First we create a simple vector induction variable which starts
4673 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4674 vector size (STEP). */
4676 /* Create a {1,2,3,...} vector. */
4677 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4679 /* Create a vector of the step value. */
4680 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4681 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4683 /* Create an induction variable. */
4684 gimple_stmt_iterator incr_gsi;
4685 bool insert_after;
4686 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4687 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4688 insert_after, &indx_before_incr, &indx_after_incr);
4690 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4691 filled with zeros (VEC_ZERO). */
4693 /* Create a vector of 0s. */
4694 tree zero = build_zero_cst (cr_index_scalar_type);
4695 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4697 /* Create a vector phi node. */
4698 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4699 new_phi = create_phi_node (new_phi_tree, loop->header);
4700 set_vinfo_for_stmt (new_phi,
4701 new_stmt_vec_info (new_phi, loop_vinfo));
4702 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4703 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4705 /* Now take the condition from the loop's original cond_expr
4706 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4707 every match uses values from the induction variable
4708 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4709 (NEW_PHI_TREE).
4710 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4711 the new cond_expr (INDEX_COND_EXPR). */
4713 /* Duplicate the condition from vec_stmt. */
4714 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4716 /* Create a conditional, where the condition is taken from vec_stmt
4717 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4718 else is the phi (NEW_PHI_TREE). */
4719 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4720 ccompare, indx_before_incr,
4721 new_phi_tree);
4722 induction_index = make_ssa_name (cr_index_vector_type);
4723 gimple *index_condition = gimple_build_assign (induction_index,
4724 index_cond_expr);
4725 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4726 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4727 loop_vinfo);
4728 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4729 set_vinfo_for_stmt (index_condition, index_vec_info);
4731 /* Update the phi with the vec cond. */
4732 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4733 loop_latch_edge (loop), UNKNOWN_LOCATION);
4736 /* 2. Create epilog code.
4737 The reduction epilog code operates across the elements of the vector
4738 of partial results computed by the vectorized loop.
4739 The reduction epilog code consists of:
4741 step 1: compute the scalar result in a vector (v_out2)
4742 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4743 step 3: adjust the scalar result (s_out3) if needed.
4745 Step 1 can be accomplished using one of the following three schemes:
4746 (scheme 1) using reduc_fn, if available.
4747 (scheme 2) using whole-vector shifts, if available.
4748 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4749 combined.
4751 The overall epilog code looks like this:
4753 s_out0 = phi <s_loop> # original EXIT_PHI
4754 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4755 v_out2 = reduce <v_out1> # step 1
4756 s_out3 = extract_field <v_out2, 0> # step 2
4757 s_out4 = adjust_result <s_out3> # step 3
4759 (step 3 is optional, and steps 1 and 2 may be combined).
4760 Lastly, the uses of s_out0 are replaced by s_out4. */
4763 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4764 v_out1 = phi <VECT_DEF>
4765 Store them in NEW_PHIS. */
4767 exit_bb = single_exit (loop)->dest;
4768 prev_phi_info = NULL;
4769 new_phis.create (vect_defs.length ());
4770 FOR_EACH_VEC_ELT (vect_defs, i, def)
4772 for (j = 0; j < ncopies; j++)
4774 tree new_def = copy_ssa_name (def);
4775 phi = create_phi_node (new_def, exit_bb);
4776 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4777 if (j == 0)
4778 new_phis.quick_push (phi);
4779 else
4781 def = vect_get_vec_def_for_stmt_copy (dt, def);
4782 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4785 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4786 prev_phi_info = vinfo_for_stmt (phi);
4790 /* The epilogue is created for the outer-loop, i.e., for the loop being
4791 vectorized. Create exit phis for the outer loop. */
4792 if (double_reduc)
4794 loop = outer_loop;
4795 exit_bb = single_exit (loop)->dest;
4796 inner_phis.create (vect_defs.length ());
4797 FOR_EACH_VEC_ELT (new_phis, i, phi)
4799 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4800 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4801 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4802 PHI_RESULT (phi));
4803 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4804 loop_vinfo));
4805 inner_phis.quick_push (phi);
4806 new_phis[i] = outer_phi;
4807 prev_phi_info = vinfo_for_stmt (outer_phi);
4808 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4810 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4811 new_result = copy_ssa_name (PHI_RESULT (phi));
4812 outer_phi = create_phi_node (new_result, exit_bb);
4813 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4814 PHI_RESULT (phi));
4815 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4816 loop_vinfo));
4817 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4818 prev_phi_info = vinfo_for_stmt (outer_phi);
4823 exit_gsi = gsi_after_labels (exit_bb);
4825 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4826 (i.e. when reduc_fn is not available) and in the final adjustment
4827 code (if needed). Also get the original scalar reduction variable as
4828 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4829 represents a reduction pattern), the tree-code and scalar-def are
4830 taken from the original stmt that the pattern-stmt (STMT) replaces.
4831 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4832 are taken from STMT. */
4834 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4835 if (!orig_stmt)
4837 /* Regular reduction */
4838 orig_stmt = stmt;
4840 else
4842 /* Reduction pattern */
4843 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4844 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4845 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4848 code = gimple_assign_rhs_code (orig_stmt);
4849 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4850 partial results are added and not subtracted. */
4851 if (code == MINUS_EXPR)
4852 code = PLUS_EXPR;
4854 scalar_dest = gimple_assign_lhs (orig_stmt);
4855 scalar_type = TREE_TYPE (scalar_dest);
4856 scalar_results.create (group_size);
4857 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4858 bitsize = TYPE_SIZE (scalar_type);
4860 /* In case this is a reduction in an inner-loop while vectorizing an outer
4861 loop - we don't need to extract a single scalar result at the end of the
4862 inner-loop (unless it is double reduction, i.e., the use of reduction is
4863 outside the outer-loop). The final vector of partial results will be used
4864 in the vectorized outer-loop, or reduced to a scalar result at the end of
4865 the outer-loop. */
4866 if (nested_in_vect_loop && !double_reduc)
4867 goto vect_finalize_reduction;
4869 /* SLP reduction without reduction chain, e.g.,
4870 # a1 = phi <a2, a0>
4871 # b1 = phi <b2, b0>
4872 a2 = operation (a1)
4873 b2 = operation (b1) */
4874 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4876 /* True if we should implement SLP_REDUC using native reduction operations
4877 instead of scalar operations. */
4878 direct_slp_reduc = (reduc_fn != IFN_LAST
4879 && slp_reduc
4880 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4882 /* In case of reduction chain, e.g.,
4883 # a1 = phi <a3, a0>
4884 a2 = operation (a1)
4885 a3 = operation (a2),
4887 we may end up with more than one vector result. Here we reduce them to
4888 one vector. */
4889 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4891 tree first_vect = PHI_RESULT (new_phis[0]);
4892 gassign *new_vec_stmt = NULL;
4893 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4894 for (k = 1; k < new_phis.length (); k++)
4896 gimple *next_phi = new_phis[k];
4897 tree second_vect = PHI_RESULT (next_phi);
4898 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4899 new_vec_stmt = gimple_build_assign (tem, code,
4900 first_vect, second_vect);
4901 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4902 first_vect = tem;
4905 new_phi_result = first_vect;
4906 if (new_vec_stmt)
4908 new_phis.truncate (0);
4909 new_phis.safe_push (new_vec_stmt);
4912 /* Likewise if we couldn't use a single defuse cycle. */
4913 else if (ncopies > 1)
4915 gcc_assert (new_phis.length () == 1);
4916 tree first_vect = PHI_RESULT (new_phis[0]);
4917 gassign *new_vec_stmt = NULL;
4918 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4919 gimple *next_phi = new_phis[0];
4920 for (int k = 1; k < ncopies; ++k)
4922 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4923 tree second_vect = PHI_RESULT (next_phi);
4924 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4925 new_vec_stmt = gimple_build_assign (tem, code,
4926 first_vect, second_vect);
4927 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4928 first_vect = tem;
4930 new_phi_result = first_vect;
4931 new_phis.truncate (0);
4932 new_phis.safe_push (new_vec_stmt);
4934 else
4935 new_phi_result = PHI_RESULT (new_phis[0]);
4937 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4938 && reduc_fn != IFN_LAST)
4940 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4941 various data values where the condition matched and another vector
4942 (INDUCTION_INDEX) containing all the indexes of those matches. We
4943 need to extract the last matching index (which will be the index with
4944 highest value) and use this to index into the data vector.
4945 For the case where there were no matches, the data vector will contain
4946 all default values and the index vector will be all zeros. */
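/* Continuing the invented example from above: with INDUCTION_INDEX ==
   {0, 2, 0, 12} and NEW_PHI_RESULT == {5, 7, 9, 42}, the code below
   computes max_index == 12, the comparison yields a mask selecting
   only the last lane, the VEC_COND gives {0, 0, 0, 42}, and the
   unsigned MAX reduction of that vector produces the scalar result
   42.  */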
4948 /* Get various versions of the type of the vector of indexes. */
4949 tree index_vec_type = TREE_TYPE (induction_index);
4950 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4951 tree index_scalar_type = TREE_TYPE (index_vec_type);
4952 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4953 (index_vec_type);
4955 /* Get an unsigned integer version of the type of the data vector. */
4956 int scalar_precision
4957 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4958 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4959 tree vectype_unsigned = build_vector_type
4960 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4962 /* First we need to create a vector (ZERO_VEC) of zeros and another
4963 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4964 can create using a MAX reduction and then expanding.
4965 In the case where the loop never made any matches, the max index will
4966 be zero. */
4968 /* Vector of {0, 0, 0,...}. */
4969 tree zero_vec = make_ssa_name (vectype);
4970 tree zero_vec_rhs = build_zero_cst (vectype);
4971 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4972 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4974 /* Find maximum value from the vector of found indexes. */
4975 tree max_index = make_ssa_name (index_scalar_type);
4976 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4977 1, induction_index);
4978 gimple_call_set_lhs (max_index_stmt, max_index);
4979 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4981 /* Vector of {max_index, max_index, max_index,...}. */
4982 tree max_index_vec = make_ssa_name (index_vec_type);
4983 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4984 max_index);
4985 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4986 max_index_vec_rhs);
4987 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4989 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4990 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4991 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4992 otherwise. Only one value should match, resulting in a vector
4993 (VEC_COND) with one data value and the rest zeros.
4994 In the case where the loop never made any matches, every index will
4995 match, resulting in a vector with all data values (which will all be
4996 the default value). */
4998 /* Compare the max index vector to the vector of found indexes to find
4999 the position of the max value. */
5000 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5001 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5002 induction_index,
5003 max_index_vec);
5004 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5006 /* Use the compare to choose either values from the data vector or
5007 zero. */
5008 tree vec_cond = make_ssa_name (vectype);
5009 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5010 vec_compare, new_phi_result,
5011 zero_vec);
5012 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5014 /* Finally we need to extract the data value from the vector (VEC_COND)
5015 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5016 reduction, but because this doesn't exist, we can use a MAX reduction
5017 instead. The data value might be signed or a float so we need to cast
5018 it first.
5019 In the case where the loop never made any matches, the data values are
5020 all identical, and so will reduce down correctly. */
5022 /* Make the matched data values unsigned. */
5023 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5024 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5025 vec_cond);
5026 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5027 VIEW_CONVERT_EXPR,
5028 vec_cond_cast_rhs);
5029 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5031 /* Reduce down to a scalar value. */
5032 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5033 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5034 1, vec_cond_cast);
5035 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5036 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5038 /* Convert the reduced value back to the result type and set as the
5039 result. */
5040 gimple_seq stmts = NULL;
5041 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5042 data_reduc);
5043 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5044 scalar_results.safe_push (new_temp);
5046 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5047 && reduc_fn == IFN_LAST)
5049 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5050 idx = 0;
5051 idx_val = induction_index[0];
5052 val = data_reduc[0];
5053 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5054 if (induction_index[i] > idx_val)
5055 val = data_reduc[i], idx_val = induction_index[i];
5056 return val; */
5058 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5059 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5060 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5061 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5062 /* Enforced by vectorizable_reduction, which ensures we have target
5063 support before allowing a conditional reduction on variable-length
5064 vectors. */
5065 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5066 tree idx_val = NULL_TREE, val = NULL_TREE;
5067 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5069 tree old_idx_val = idx_val;
5070 tree old_val = val;
5071 idx_val = make_ssa_name (idx_eltype);
5072 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5073 build3 (BIT_FIELD_REF, idx_eltype,
5074 induction_index,
5075 bitsize_int (el_size),
5076 bitsize_int (off)));
5077 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5078 val = make_ssa_name (data_eltype);
5079 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5080 build3 (BIT_FIELD_REF,
5081 data_eltype,
5082 new_phi_result,
5083 bitsize_int (el_size),
5084 bitsize_int (off)));
5085 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5086 if (off != 0)
5088 tree new_idx_val = idx_val;
5089 tree new_val = val;
5090 if (off != v_size - el_size)
5092 new_idx_val = make_ssa_name (idx_eltype);
5093 epilog_stmt = gimple_build_assign (new_idx_val,
5094 MAX_EXPR, idx_val,
5095 old_idx_val);
5096 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098 new_val = make_ssa_name (data_eltype);
5099 epilog_stmt = gimple_build_assign (new_val,
5100 COND_EXPR,
5101 build2 (GT_EXPR,
5102 boolean_type_node,
5103 idx_val,
5104 old_idx_val),
5105 val, old_val);
5106 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5107 idx_val = new_idx_val;
5108 val = new_val;
5111 /* Convert the reduced value back to the result type and set as the
5112 result. */
5113 gimple_seq stmts = NULL;
5114 val = gimple_convert (&stmts, scalar_type, val);
5115 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5116 scalar_results.safe_push (val);
5119 /* 2.3 Create the reduction code, using one of the three schemes described
5120 above. In SLP we simply need to extract all the elements from the
5121 vector (without reducing them), so we use scalar shifts. */
5122 else if (reduc_fn != IFN_LAST && !slp_reduc)
5124 tree tmp;
5125 tree vec_elem_type;
5127 /* Case 1: Create:
5128 v_out2 = reduc_expr <v_out1> */
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_NOTE, vect_location,
5132 "Reduce using direct vector reduction.\n");
5134 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5135 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5137 tree tmp_dest
5138 = vect_create_destination_var (scalar_dest, vec_elem_type);
5139 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5140 new_phi_result);
5141 gimple_set_lhs (epilog_stmt, tmp_dest);
5142 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5143 gimple_set_lhs (epilog_stmt, new_temp);
5144 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5147 new_temp);
5149 else
5151 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5152 new_phi_result);
5153 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5156 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5157 gimple_set_lhs (epilog_stmt, new_temp);
5158 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5160 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5161 == INTEGER_INDUC_COND_REDUCTION)
5162 && !operand_equal_p (initial_def, induc_val, 0))
5164 /* Earlier we set the initial value to be a vector of induc_val
5165 values. Check the result and if it is induc_val then replace
5166 it with the original initial value, unless induc_val is
5167 the same as initial_def already. */
5168 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5169 induc_val);
5171 tmp = make_ssa_name (new_scalar_dest);
5172 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5173 initial_def, new_temp);
5174 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5175 new_temp = tmp;
5178 scalar_results.safe_push (new_temp);
5180 else if (direct_slp_reduc)
5182 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5183 with the elements for other SLP statements replaced with the
5184 neutral value. We can then do a normal reduction on each vector. */
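/* Sketch of the idea (values invented): with a group size of 2 and a
   4-lane vector holding {a0, b0, a1, b1}, the masked index vector
   built below is {0, 1, 0, 1}.  For i == 0 the select keeps
   {a0, neutral, a1, neutral} and the full-vector reduction gives the
   result of the first SLP statement; i == 1 does the same for the
   b elements.  */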
5186 /* Enforced by vectorizable_reduction. */
5187 gcc_assert (new_phis.length () == 1);
5188 gcc_assert (pow2p_hwi (group_size));
5190 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5191 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5192 gimple_seq seq = NULL;
5194 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5195 and the same element size as VECTYPE. */
5196 tree index = build_index_vector (vectype, 0, 1);
5197 tree index_type = TREE_TYPE (index);
5198 tree index_elt_type = TREE_TYPE (index_type);
5199 tree mask_type = build_same_sized_truth_vector_type (index_type);
5201 /* Create a vector that, for each element, identifies which of
5202 the REDUC_GROUP_SIZE results should use it. */
5203 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5204 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5205 build_vector_from_val (index_type, index_mask));
5207 /* Get a neutral vector value. This is simply a splat of the neutral
5208 scalar value if we have one, otherwise the initial scalar value
5209 is itself a neutral value. */
5210 tree vector_identity = NULL_TREE;
5211 if (neutral_op)
5212 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5213 neutral_op);
5214 for (unsigned int i = 0; i < group_size; ++i)
5216 /* If there's no universal neutral value, we can use the
5217 initial scalar value from the original PHI. This is used
5218 for MIN and MAX reduction, for example. */
5219 if (!neutral_op)
5221 tree scalar_value
5222 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5223 loop_preheader_edge (loop));
5224 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5225 scalar_value);
5228 /* Calculate the equivalent of:
5230 sel[j] = (index[j] == i);
5232 which selects the elements of NEW_PHI_RESULT that should
5233 be included in the result. */
5234 tree compare_val = build_int_cst (index_elt_type, i);
5235 compare_val = build_vector_from_val (index_type, compare_val);
5236 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5237 index, compare_val);
5239 /* Calculate the equivalent of:
5241 vec = sel ? new_phi_result : vector_identity;
5243 VEC is now suitable for a full vector reduction. */
5244 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5245 sel, new_phi_result, vector_identity);
5247 /* Do the reduction and convert it to the appropriate type. */
5248 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5249 TREE_TYPE (vectype), vec);
5250 scalar = gimple_convert (&seq, scalar_type, scalar);
5251 scalar_results.safe_push (scalar);
5253 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5255 else
5257 bool reduce_with_shift;
5258 tree vec_temp;
5260 /* COND reductions all do the final reduction with MAX_EXPR
5261 or MIN_EXPR. */
5262 if (code == COND_EXPR)
5264 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5265 == INTEGER_INDUC_COND_REDUCTION)
5266 code = induc_code;
5267 else
5268 code = MAX_EXPR;
5271 /* See if the target wants to do the final (shift) reduction
5272 in a vector mode of smaller size and first reduce upper/lower
5273 halves against each other. */
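/* For example (illustrative, target-dependent): if the target's
   split_reduction hook maps a 16-byte mode to an 8-byte one, SZ1
   becomes 8; the loop below then extracts the low and high halves of
   the accumulator, combines them with CODE, and the remaining shift
   reduction runs in the narrower mode.  */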
5274 enum machine_mode mode1 = mode;
5275 tree vectype1 = vectype;
5276 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5277 unsigned sz1 = sz;
5278 if (!slp_reduc
5279 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5280 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5282 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5283 reduce_with_shift = have_whole_vector_shift (mode1);
5284 if (!VECTOR_MODE_P (mode1))
5285 reduce_with_shift = false;
5286 else
5288 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5289 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5290 reduce_with_shift = false;
5293 /* First reduce the vector to the desired vector size on which we
5294 should do the shift reduction, by combining upper and lower halves. */
5295 new_temp = new_phi_result;
5296 while (sz > sz1)
5298 gcc_assert (!slp_reduc);
5299 sz /= 2;
5300 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5302 /* The target has to make sure we support lowpart/highpart
5303 extraction, either via direct vector extract or through
5304 punning to an integer mode vector. */
5305 tree dst1, dst2;
5306 if (convert_optab_handler (vec_extract_optab,
5307 TYPE_MODE (TREE_TYPE (new_temp)),
5308 TYPE_MODE (vectype1))
5309 != CODE_FOR_nothing)
5311 /* Extract sub-vectors directly once vec_extract becomes
5312 a conversion optab. */
5313 dst1 = make_ssa_name (vectype1);
5314 epilog_stmt
5315 = gimple_build_assign (dst1, BIT_FIELD_REF,
5316 build3 (BIT_FIELD_REF, vectype1,
5317 new_temp, TYPE_SIZE (vectype1),
5318 bitsize_int (0)));
5319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5320 dst2 = make_ssa_name (vectype1);
5321 epilog_stmt
5322 = gimple_build_assign (dst2, BIT_FIELD_REF,
5323 build3 (BIT_FIELD_REF, vectype1,
5324 new_temp, TYPE_SIZE (vectype1),
5325 bitsize_int (sz * BITS_PER_UNIT)));
5326 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5328 else
5330 /* Extract via punning to appropriately sized integer mode
5331 vector. */
5332 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5333 1);
5334 tree etype = build_vector_type (eltype, 2);
5335 gcc_assert (convert_optab_handler (vec_extract_optab,
5336 TYPE_MODE (etype),
5337 TYPE_MODE (eltype))
5338 != CODE_FOR_nothing);
5339 tree tem = make_ssa_name (etype);
5340 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5341 build1 (VIEW_CONVERT_EXPR,
5342 etype, new_temp));
5343 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5344 new_temp = tem;
5345 tem = make_ssa_name (eltype);
5346 epilog_stmt
5347 = gimple_build_assign (tem, BIT_FIELD_REF,
5348 build3 (BIT_FIELD_REF, eltype,
5349 new_temp, TYPE_SIZE (eltype),
5350 bitsize_int (0)));
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352 dst1 = make_ssa_name (vectype1);
5353 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5354 build1 (VIEW_CONVERT_EXPR,
5355 vectype1, tem));
5356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357 tem = make_ssa_name (eltype);
5358 epilog_stmt
5359 = gimple_build_assign (tem, BIT_FIELD_REF,
5360 build3 (BIT_FIELD_REF, eltype,
5361 new_temp, TYPE_SIZE (eltype),
5362 bitsize_int (sz * BITS_PER_UNIT)));
5363 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5364 dst2 = make_ssa_name (vectype1);
5365 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5366 build1 (VIEW_CONVERT_EXPR,
5367 vectype1, tem));
5368 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5371 new_temp = make_ssa_name (vectype1);
5372 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5373 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5376 if (reduce_with_shift && !slp_reduc)
5378 int element_bitsize = tree_to_uhwi (bitsize);
5379 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5380 for variable-length vectors and also requires direct target support
5381 for loop reductions. */
5382 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5383 int nelements = vec_size_in_bits / element_bitsize;
5384 vec_perm_builder sel;
5385 vec_perm_indices indices;
5387 int elt_offset;
5389 tree zero_vec = build_zero_cst (vectype1);
5390 /* Case 2: Create:
5391 for (offset = nelements/2; offset >= 1; offset/=2)
5393 Create: va' = vec_shift <va, offset>
5394 Create: va = vop <va, va'>
5395 } */
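/* Illustrative sketch, assuming the permutation shifts elements towards
element 0 (which is where the scalar result is extracted below): for a
V4SI PLUS reduction of {a,b,c,d}:
offset 2: va' = {c,d,0,0} va = {a+c, b+d, c, d}
offset 1: va' = {b+d,c,d,0} va = {a+b+c+d, ...}
so element 0 ends up holding the full reduction. */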
5397 tree rhs;
5399 if (dump_enabled_p ())
5400 dump_printf_loc (MSG_NOTE, vect_location,
5401 "Reduce using vector shifts\n");
5403 mode1 = TYPE_MODE (vectype1);
5404 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5405 for (elt_offset = nelements / 2;
5406 elt_offset >= 1;
5407 elt_offset /= 2)
5409 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5410 indices.new_vector (sel, 2, nelements);
5411 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5412 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5413 new_temp, zero_vec, mask);
5414 new_name = make_ssa_name (vec_dest, epilog_stmt);
5415 gimple_assign_set_lhs (epilog_stmt, new_name);
5416 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5418 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5419 new_temp);
5420 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5421 gimple_assign_set_lhs (epilog_stmt, new_temp);
5422 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5425 /* 2.4 Extract the final scalar result. Create:
5426 s_out3 = extract_field <v_out2, bitpos> */
5428 if (dump_enabled_p ())
5429 dump_printf_loc (MSG_NOTE, vect_location,
5430 "extract scalar result\n");
5432 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5433 bitsize, bitsize_zero_node);
5434 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5435 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5436 gimple_assign_set_lhs (epilog_stmt, new_temp);
5437 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5438 scalar_results.safe_push (new_temp);
5440 else
5442 /* Case 3: Create:
5443 s = extract_field <v_out2, 0>
5444 for (offset = element_size;
5445 offset < vector_size;
5446 offset += element_size;)
5448 Create: s' = extract_field <v_out2, offset>
5449 Create: s = op <s, s'> // For non SLP cases
5450 } */
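/* Illustrative example: for a single V4SI vector {a,b,c,d} and a PLUS
reduction this generates, in the non-SLP case:
s = a; s = s + b; s = s + c; s = s + d;
In the SLP case the extracted elements are instead collected in
SCALAR_RESULTS, as noted below. */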
5452 if (dump_enabled_p ())
5453 dump_printf_loc (MSG_NOTE, vect_location,
5454 "Reduce using scalar code.\n");
5456 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5457 int element_bitsize = tree_to_uhwi (bitsize);
5458 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5460 int bit_offset;
5461 if (gimple_code (new_phi) == GIMPLE_PHI)
5462 vec_temp = PHI_RESULT (new_phi);
5463 else
5464 vec_temp = gimple_assign_lhs (new_phi);
5465 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5466 bitsize_zero_node);
5467 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5468 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5469 gimple_assign_set_lhs (epilog_stmt, new_temp);
5470 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5472 /* In SLP we don't need to apply the reduction operation, so we just
5473 collect s' values in SCALAR_RESULTS. */
5474 if (slp_reduc)
5475 scalar_results.safe_push (new_temp);
5477 for (bit_offset = element_bitsize;
5478 bit_offset < vec_size_in_bits;
5479 bit_offset += element_bitsize)
5481 tree bitpos = bitsize_int (bit_offset);
5482 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5483 bitsize, bitpos);
5485 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5486 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5487 gimple_assign_set_lhs (epilog_stmt, new_name);
5488 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5490 if (slp_reduc)
5492 /* In SLP we don't need to apply the reduction operation, so
5493 we just collect s' values in SCALAR_RESULTS. */
5494 new_temp = new_name;
5495 scalar_results.safe_push (new_name);
5497 else
5499 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5500 new_name, new_temp);
5501 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5502 gimple_assign_set_lhs (epilog_stmt, new_temp);
5503 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5508 /* The only case where we need to reduce scalar results in SLP is
5509 unrolling. If the size of SCALAR_RESULTS is greater than
5510 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5511 REDUC_GROUP_SIZE. */
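/* Illustrative example: with REDUC_GROUP_SIZE == 2 and four scalar
results r0..r3 from unrolling, the loop below produces
scalar_results[0] = r0 op r2 and scalar_results[1] = r1 op r3. */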
5512 if (slp_reduc)
5514 tree res, first_res, new_res;
5515 gimple *new_stmt;
5517 /* Reduce multiple scalar results in case of SLP unrolling. */
5518 for (j = group_size; scalar_results.iterate (j, &res);
5519 j++)
5521 first_res = scalar_results[j % group_size];
5522 new_stmt = gimple_build_assign (new_scalar_dest, code,
5523 first_res, res);
5524 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5525 gimple_assign_set_lhs (new_stmt, new_res);
5526 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5527 scalar_results[j % group_size] = new_res;
5530 else
5531 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5532 scalar_results.safe_push (new_temp);
5535 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5536 == INTEGER_INDUC_COND_REDUCTION)
5537 && !operand_equal_p (initial_def, induc_val, 0))
5539 /* Earlier we set the initial value to be a vector of induc_val
5540 values. Check the result; if it is induc_val, replace it with
5541 the original initial value, unless induc_val is already
5542 the same as initial_def. */
5543 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5544 induc_val);
5546 tree tmp = make_ssa_name (new_scalar_dest);
5547 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5548 initial_def, new_temp);
5549 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5550 scalar_results[0] = tmp;
5554 vect_finalize_reduction:
5556 if (double_reduc)
5557 loop = loop->inner;
5559 /* 2.5 Adjust the final result by the initial value of the reduction
5560 variable. (When such adjustment is not needed, then
5561 'adjustment_def' is zero). For example, if code is PLUS we create:
5562 new_temp = loop_exit_def + adjustment_def */
5564 if (adjustment_def)
5566 gcc_assert (!slp_reduc);
5567 if (nested_in_vect_loop)
5569 new_phi = new_phis[0];
5570 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5571 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5572 new_dest = vect_create_destination_var (scalar_dest, vectype);
5574 else
5576 new_temp = scalar_results[0];
5577 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5578 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5579 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5582 epilog_stmt = gimple_build_assign (new_dest, expr);
5583 new_temp = make_ssa_name (new_dest, epilog_stmt);
5584 gimple_assign_set_lhs (epilog_stmt, new_temp);
5585 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5586 if (nested_in_vect_loop)
5588 set_vinfo_for_stmt (epilog_stmt,
5589 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5590 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5591 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5593 if (!double_reduc)
5594 scalar_results.quick_push (new_temp);
5595 else
5596 scalar_results[0] = new_temp;
5598 else
5599 scalar_results[0] = new_temp;
5601 new_phis[0] = epilog_stmt;
5604 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5605 phis with new adjusted scalar results, i.e., replace use <s_out0>
5606 with use <s_out4>.
5608 Transform:
5609 loop_exit:
5610 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5611 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5612 v_out2 = reduce <v_out1>
5613 s_out3 = extract_field <v_out2, 0>
5614 s_out4 = adjust_result <s_out3>
5615 use <s_out0>
5616 use <s_out0>
5618 into:
5620 loop_exit:
5621 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5622 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5623 v_out2 = reduce <v_out1>
5624 s_out3 = extract_field <v_out2, 0>
5625 s_out4 = adjust_result <s_out3>
5626 use <s_out4>
5627 use <s_out4> */
5630 /* For an SLP reduction chain we reduce the vector results into one vector
5631 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5632 LHS of the last stmt in the reduction chain, since we are looking for
5633 the loop exit phi node. */
5634 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5636 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5637 /* Handle reduction patterns. */
5638 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5639 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5641 scalar_dest = gimple_assign_lhs (dest_stmt);
5642 group_size = 1;
5645 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5646 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5647 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5648 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5649 correspond to the first vector stmt, etc.
5650 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
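/* Illustrative example: with REDUC_GROUP_SIZE == 4 and two vector stmts
in NEW_PHIS, RATIO is 2, so scalar results 0-1 are matched with the
first vector stmt and scalar results 2-3 with the second. */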
5651 if (group_size > new_phis.length ())
5653 ratio = group_size / new_phis.length ();
5654 gcc_assert (!(group_size % new_phis.length ()));
5656 else
5657 ratio = 1;
5659 for (k = 0; k < group_size; k++)
5661 if (k % ratio == 0)
5663 epilog_stmt = new_phis[k / ratio];
5664 reduction_phi = reduction_phis[k / ratio];
5665 if (double_reduc)
5666 inner_phi = inner_phis[k / ratio];
5669 if (slp_reduc)
5671 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5673 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5674 /* SLP statements can't participate in patterns. */
5675 gcc_assert (!orig_stmt);
5676 scalar_dest = gimple_assign_lhs (current_stmt);
5679 phis.create (3);
5680 /* Find the loop-closed-use at the loop exit of the original scalar
5681 result. (The reduction result is expected to have two immediate uses -
5682 one at the latch block, and one at the loop exit). */
5683 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5684 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5685 && !is_gimple_debug (USE_STMT (use_p)))
5686 phis.safe_push (USE_STMT (use_p));
5688 /* While we expect to have found an exit_phi because of loop-closed-ssa
5689 form we can end up without one if the scalar cycle is dead. */
5691 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5693 if (outer_loop)
5695 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5696 gphi *vect_phi;
5698 /* FORNOW. We do not currently support the case in which an inner-loop
5699 reduction is not used in the outer loop (but only outside the
5700 outer loop), unless it is a double reduction. */
5701 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5702 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5703 || double_reduc);
5705 if (double_reduc)
5706 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5707 else
5708 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5709 if (!double_reduc
5710 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5711 != vect_double_reduction_def)
5712 continue;
5714 /* Handle double reduction:
5716 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5717 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5718 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5719 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5721 At that point the regular reduction (stmt2 and stmt3) is
5722 already vectorized, as well as the exit phi node, stmt4.
5723 Here we vectorize the phi node of double reduction, stmt1, and
5724 update all relevant statements. */
5726 /* Go through all the uses of s2 to find double reduction phi
5727 node, i.e., stmt1 above. */
5728 orig_name = PHI_RESULT (exit_phi);
5729 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5731 stmt_vec_info use_stmt_vinfo;
5732 stmt_vec_info new_phi_vinfo;
5733 tree vect_phi_init, preheader_arg, vect_phi_res;
5734 basic_block bb = gimple_bb (use_stmt);
5735 gimple *use;
5737 /* Check that USE_STMT is really a double reduction phi
5738 node. */
5739 if (gimple_code (use_stmt) != GIMPLE_PHI
5740 || gimple_phi_num_args (use_stmt) != 2
5741 || bb->loop_father != outer_loop)
5742 continue;
5743 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5744 if (!use_stmt_vinfo
5745 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5746 != vect_double_reduction_def)
5747 continue;
5749 /* Create vector phi node for double reduction:
5750 vs1 = phi <vs0, vs2>
5751 vs1 was created previously in this function by a call to
5752 vect_get_vec_def_for_operand and is stored in
5753 vec_initial_def;
5754 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5755 vs0 is created here. */
5757 /* Create vector phi node. */
5758 vect_phi = create_phi_node (vec_initial_def, bb);
5759 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5760 loop_vec_info_for_loop (outer_loop));
5761 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5763 /* Create vs0 - initial def of the double reduction phi. */
5764 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5765 loop_preheader_edge (outer_loop));
5766 vect_phi_init = get_initial_def_for_reduction
5767 (stmt, preheader_arg, NULL);
5769 /* Update phi node arguments with vs0 and vs2. */
5770 add_phi_arg (vect_phi, vect_phi_init,
5771 loop_preheader_edge (outer_loop),
5772 UNKNOWN_LOCATION);
5773 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5774 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5775 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE, vect_location,
5778 "created double reduction phi node: ");
5779 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5782 vect_phi_res = PHI_RESULT (vect_phi);
5784 /* Replace the use, i.e., set the correct vs1 in the regular
5785 reduction phi node. FORNOW, NCOPIES is always 1, so the
5786 loop is redundant. */
5787 use = reduction_phi;
5788 for (j = 0; j < ncopies; j++)
5790 edge pr_edge = loop_preheader_edge (loop);
5791 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5792 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5798 phis.release ();
5799 if (nested_in_vect_loop)
5801 if (double_reduc)
5802 loop = outer_loop;
5803 else
5804 continue;
5807 phis.create (3);
5808 /* Find the loop-closed-use at the loop exit of the original scalar
5809 result. (The reduction result is expected to have two immediate uses,
5810 one at the latch block, and one at the loop exit). For double
5811 reductions we are looking for exit phis of the outer loop. */
5812 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5814 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5816 if (!is_gimple_debug (USE_STMT (use_p)))
5817 phis.safe_push (USE_STMT (use_p));
5819 else
5821 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5823 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5825 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5827 if (!flow_bb_inside_loop_p (loop,
5828 gimple_bb (USE_STMT (phi_use_p)))
5829 && !is_gimple_debug (USE_STMT (phi_use_p)))
5830 phis.safe_push (USE_STMT (phi_use_p));
5836 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5838 /* Replace the uses: */
5839 orig_name = PHI_RESULT (exit_phi);
5840 scalar_result = scalar_results[k];
5841 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5842 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5843 SET_USE (use_p, scalar_result);
5846 phis.release ();
5850 /* Return a vector of type VECTYPE that is equal to the vector select
5851 operation "MASK ? VEC : IDENTITY". Insert the select statements
5852 before GSI. */
5854 static tree
5855 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5856 tree vec, tree identity)
5858 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5859 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5860 mask, vec, identity);
5861 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5862 return cond;
5865 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5866 order, starting with LHS. Insert the extraction statements before GSI and
5867 associate the new scalar SSA names with variable SCALAR_DEST.
5868 Return the SSA name for the result. */
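/* Illustrative example: for a 4-element VECTOR_RHS {v0,v1,v2,v3} the
expansion below computes
(((LHS CODE v0) CODE v1) CODE v2) CODE v3
strictly in left-to-right order. */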
5870 static tree
5871 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5872 tree_code code, tree lhs, tree vector_rhs)
5874 tree vectype = TREE_TYPE (vector_rhs);
5875 tree scalar_type = TREE_TYPE (vectype);
5876 tree bitsize = TYPE_SIZE (scalar_type);
5877 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5878 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5880 for (unsigned HOST_WIDE_INT bit_offset = 0;
5881 bit_offset < vec_size_in_bits;
5882 bit_offset += element_bitsize)
5884 tree bitpos = bitsize_int (bit_offset);
5885 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5886 bitsize, bitpos);
5888 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5889 rhs = make_ssa_name (scalar_dest, stmt);
5890 gimple_assign_set_lhs (stmt, rhs);
5891 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5893 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5894 tree new_name = make_ssa_name (scalar_dest, stmt);
5895 gimple_assign_set_lhs (stmt, new_name);
5896 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5897 lhs = new_name;
5899 return lhs;
5902 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5903 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5904 statement. CODE is the operation performed by STMT and OPS are
5905 its scalar operands. REDUC_INDEX is the index of the operand in
5906 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5907 implements in-order reduction, or IFN_LAST if we should open-code it.
5908 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5909 that should be used to control the operation in a fully-masked loop. */
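/* Illustrative note: when REDUC_FN is available (e.g. IFN_FOLD_LEFT_PLUS),
each vector operand OP is consumed by a call of the form
reduc_var = REDUC_FN (reduc_var, OP);
otherwise the reduction is open-coded element by element via
vect_expand_fold_left. In a fully-masked loop, inactive lanes are first
replaced with a zero identity vector via merge_with_identity. */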
5911 static bool
5912 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5913 gimple **vec_stmt, slp_tree slp_node,
5914 gimple *reduc_def_stmt,
5915 tree_code code, internal_fn reduc_fn,
5916 tree ops[3], tree vectype_in,
5917 int reduc_index, vec_loop_masks *masks)
5919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5920 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5921 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5922 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5923 gimple *new_stmt = NULL;
5925 int ncopies;
5926 if (slp_node)
5927 ncopies = 1;
5928 else
5929 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5931 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5932 gcc_assert (ncopies == 1);
5933 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5934 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5935 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5936 == FOLD_LEFT_REDUCTION);
5938 if (slp_node)
5939 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5940 TYPE_VECTOR_SUBPARTS (vectype_in)));
5942 tree op0 = ops[1 - reduc_index];
5944 int group_size = 1;
5945 gimple *scalar_dest_def;
5946 auto_vec<tree> vec_oprnds0;
5947 if (slp_node)
5949 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5950 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5951 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5953 else
5955 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5956 vec_oprnds0.create (1);
5957 vec_oprnds0.quick_push (loop_vec_def0);
5958 scalar_dest_def = stmt;
5961 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5962 tree scalar_type = TREE_TYPE (scalar_dest);
5963 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5965 int vec_num = vec_oprnds0.length ();
5966 gcc_assert (vec_num == 1 || slp_node);
5967 tree vec_elem_type = TREE_TYPE (vectype_out);
5968 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5970 tree vector_identity = NULL_TREE;
5971 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5972 vector_identity = build_zero_cst (vectype_out);
5974 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5975 int i;
5976 tree def0;
5977 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5979 tree mask = NULL_TREE;
5980 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5981 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5983 /* Handle MINUS by adding the negative. */
5984 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5986 tree negated = make_ssa_name (vectype_out);
5987 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5988 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5989 def0 = negated;
5992 if (mask)
5993 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5994 vector_identity);
5996 /* On the first iteration the input is simply the scalar phi
5997 result, and for subsequent iterations it is the output of
5998 the preceding operation. */
5999 if (reduc_fn != IFN_LAST)
6001 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6002 /* For chained SLP reductions the output of the previous reduction
6003 operation serves as the input of the next. For the final statement
6004 the output cannot be a temporary - we reuse the original
6005 scalar destination of the last statement. */
6006 if (i != vec_num - 1)
6008 gimple_set_lhs (new_stmt, scalar_dest_var);
6009 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6010 gimple_set_lhs (new_stmt, reduc_var);
6013 else
6015 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6016 reduc_var, def0);
6017 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6018 /* Remove the statement, so that we can use the same code paths
6019 as for statements that we've just created. */
6020 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6021 gsi_remove (&tmp_gsi, false);
6024 if (i == vec_num - 1)
6026 gimple_set_lhs (new_stmt, scalar_dest);
6027 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6029 else
6030 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6032 if (slp_node)
6033 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6036 if (!slp_node)
6037 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6039 return true;
6042 /* Function is_nonwrapping_integer_induction.
6044 Check that STMT (which is part of loop LOOP) is an incrementing
6045 integer induction that cannot cause overflow. */
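/* Illustrative example: for an induction with constant BASE and STEP in
a loop that executes at most NI times, the code below checks that
BASE + STEP * NI still fits in the precision of the PHI result type,
unless overflow is already undefined for that type. */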
6047 static bool
6048 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6050 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6051 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6052 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6053 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6054 widest_int ni, max_loop_value, lhs_max;
6055 bool overflow = false;
6057 /* Make sure the loop is integer based. */
6058 if (TREE_CODE (base) != INTEGER_CST
6059 || TREE_CODE (step) != INTEGER_CST)
6060 return false;
6062 /* Check that the max size of the loop will not wrap. */
6064 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6065 return true;
6067 if (! max_stmt_executions (loop, &ni))
6068 return false;
6070 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6071 &overflow);
6072 if (overflow)
6073 return false;
6075 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6076 TYPE_SIGN (lhs_type), &overflow);
6077 if (overflow)
6078 return false;
6080 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6081 <= TYPE_PRECISION (lhs_type));
6084 /* Function vectorizable_reduction.
6086 Check if STMT performs a reduction operation that can be vectorized.
6087 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6088 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6089 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6091 This function also handles reduction idioms (patterns) that have been
6092 recognized in advance during vect_pattern_recog. In this case, STMT may be
6093 of this form:
6094 X = pattern_expr (arg0, arg1, ..., X)
6095 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6096 sequence that had been detected and replaced by the pattern-stmt (STMT).
6098 This function also handles reduction of condition expressions, for example:
6099 for (int i = 0; i < N; i++)
6100 if (a[i] < value)
6101 last = a[i];
6102 This is handled by vectorising the loop and creating an additional vector
6103 containing the loop indexes for which "a[i] < value" was true. In the
6104 function epilogue this is reduced to a single max value and then used to
6105 index into the vector of results.
6107 In some cases of reduction patterns, the type of the reduction variable X is
6108 different than the type of the other arguments of STMT.
6109 In such cases, the vectype that is used when transforming STMT into a vector
6110 stmt is different than the vectype that is used to determine the
6111 vectorization factor, because it consists of a different number of elements
6112 than the actual number of elements that are being operated upon in parallel.
6114 For example, consider an accumulation of shorts into an int accumulator.
6115 On some targets it's possible to vectorize this pattern operating on 8
6116 shorts at a time (hence, the vectype for purposes of determining the
6117 vectorization factor should be V8HI); on the other hand, the vectype that
6118 is used to create the vector form is actually V4SI (the type of the result).
6120 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6121 indicates what is the actual level of parallelism (V8HI in the example), so
6122 that the right vectorization factor would be derived. This vectype
6123 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6124 be used to create the vectorized stmt. The right vectype for the vectorized
6125 stmt is obtained from the type of the result X:
6126 get_vectype_for_scalar_type (TREE_TYPE (X))
6128 This means that, contrary to "regular" reductions (or "regular" stmts in
6129 general), the following equation:
6130 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6131 does *NOT* necessarily hold for reduction patterns. */
6133 bool
6134 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6135 gimple **vec_stmt, slp_tree slp_node,
6136 slp_instance slp_node_instance,
6137 stmt_vector_for_cost *cost_vec)
6139 tree vec_dest;
6140 tree scalar_dest;
6141 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6142 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6143 tree vectype_in = NULL_TREE;
6144 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6145 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6146 enum tree_code code, orig_code;
6147 internal_fn reduc_fn;
6148 machine_mode vec_mode;
6149 int op_type;
6150 optab optab;
6151 tree new_temp = NULL_TREE;
6152 gimple *def_stmt;
6153 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6154 gimple *cond_reduc_def_stmt = NULL;
6155 enum tree_code cond_reduc_op_code = ERROR_MARK;
6156 tree scalar_type;
6157 bool is_simple_use;
6158 gimple *orig_stmt;
6159 stmt_vec_info orig_stmt_info = NULL;
6160 int i;
6161 int ncopies;
6162 int epilog_copies;
6163 stmt_vec_info prev_stmt_info, prev_phi_info;
6164 bool single_defuse_cycle = false;
6165 gimple *new_stmt = NULL;
6166 int j;
6167 tree ops[3];
6168 enum vect_def_type dts[3];
6169 bool nested_cycle = false, found_nested_cycle_def = false;
6170 bool double_reduc = false;
6171 basic_block def_bb;
6172 struct loop * def_stmt_loop, *outer_loop = NULL;
6173 tree def_arg;
6174 gimple *def_arg_stmt;
6175 auto_vec<tree> vec_oprnds0;
6176 auto_vec<tree> vec_oprnds1;
6177 auto_vec<tree> vec_oprnds2;
6178 auto_vec<tree> vect_defs;
6179 auto_vec<gimple *> phis;
6180 int vec_num;
6181 tree def0, tem;
6182 bool first_p = true;
6183 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6184 tree cond_reduc_val = NULL_TREE;
6186 /* Make sure it was already recognized as a reduction computation. */
6187 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6188 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6189 return false;
6191 if (nested_in_vect_loop_p (loop, stmt))
6193 outer_loop = loop;
6194 loop = loop->inner;
6195 nested_cycle = true;
6198 /* In case of a reduction chain we switch to the first stmt in the chain, but
6199 we don't update STMT_INFO, since only the last stmt is marked as reduction
6200 and has reduction properties. */
6201 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6202 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6204 stmt = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6205 first_p = false;
6208 if (gimple_code (stmt) == GIMPLE_PHI)
6210 /* Analysis is fully done on the reduction stmt invocation. */
6211 if (! vec_stmt)
6213 if (slp_node)
6214 slp_node_instance->reduc_phis = slp_node;
6216 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6217 return true;
6220 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6221 /* Leave the scalar phi in place. Note that checking
6222 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6223 for reductions involving a single statement. */
6224 return true;
6226 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6227 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6228 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6230 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6231 == EXTRACT_LAST_REDUCTION)
6232 /* Leave the scalar phi in place. */
6233 return true;
6235 gcc_assert (is_gimple_assign (reduc_stmt));
6236 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6238 tree op = gimple_op (reduc_stmt, k);
6239 if (op == gimple_phi_result (stmt))
6240 continue;
6241 if (k == 1
6242 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6243 continue;
6244 if (!vectype_in
6245 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6246 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6247 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6248 break;
6250 gcc_assert (vectype_in);
6252 if (slp_node)
6253 ncopies = 1;
6254 else
6255 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6257 use_operand_p use_p;
6258 gimple *use_stmt;
6259 if (ncopies > 1
6260 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6261 <= vect_used_only_live)
6262 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6263 && (use_stmt == reduc_stmt
6264 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6265 == reduc_stmt)))
6266 single_defuse_cycle = true;
6268 /* Create the destination vector */
6269 scalar_dest = gimple_assign_lhs (reduc_stmt);
6270 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6272 if (slp_node)
6273 /* The size vect_schedule_slp_instance computes is off for us. */
6274 vec_num = vect_get_num_vectors
6275 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6276 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6277 vectype_in);
6278 else
6279 vec_num = 1;
6281 /* Generate the reduction PHIs upfront. */
6282 prev_phi_info = NULL;
6283 for (j = 0; j < ncopies; j++)
6285 if (j == 0 || !single_defuse_cycle)
6287 for (i = 0; i < vec_num; i++)
6289 /* Create the reduction-phi that defines the reduction
6290 operand. */
6291 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6292 set_vinfo_for_stmt (new_phi,
6293 new_stmt_vec_info (new_phi, loop_vinfo));
6295 if (slp_node)
6296 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6297 else
6299 if (j == 0)
6300 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6301 else
6302 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6303 prev_phi_info = vinfo_for_stmt (new_phi);
6309 return true;
6312 /* 1. Is vectorizable reduction? */
6313 /* Not supportable if the reduction variable is used in the loop, unless
6314 it's a reduction chain. */
6315 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6316 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6317 return false;
6319 /* Reductions that are not used even in an enclosing outer-loop,
6320 are expected to be "live" (used out of the loop). */
6321 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6322 && !STMT_VINFO_LIVE_P (stmt_info))
6323 return false;
6325 /* 2. Has this been recognized as a reduction pattern?
6327 Check if STMT represents a pattern that has been recognized
6328 in earlier analysis stages. For stmts that represent a pattern,
6329 the STMT_VINFO_RELATED_STMT field records the last stmt in
6330 the original sequence that constitutes the pattern. */
6332 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6333 if (orig_stmt)
6335 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6336 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6337 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6340 /* 3. Check the operands of the operation. The first operands are defined
6341 inside the loop body. The last operand is the reduction variable,
6342 which is defined by the loop-header-phi. */
6344 gcc_assert (is_gimple_assign (stmt));
6346 /* Flatten RHS. */
6347 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6349 case GIMPLE_BINARY_RHS:
6350 code = gimple_assign_rhs_code (stmt);
6351 op_type = TREE_CODE_LENGTH (code);
6352 gcc_assert (op_type == binary_op);
6353 ops[0] = gimple_assign_rhs1 (stmt);
6354 ops[1] = gimple_assign_rhs2 (stmt);
6355 break;
6357 case GIMPLE_TERNARY_RHS:
6358 code = gimple_assign_rhs_code (stmt);
6359 op_type = TREE_CODE_LENGTH (code);
6360 gcc_assert (op_type == ternary_op);
6361 ops[0] = gimple_assign_rhs1 (stmt);
6362 ops[1] = gimple_assign_rhs2 (stmt);
6363 ops[2] = gimple_assign_rhs3 (stmt);
6364 break;
6366 case GIMPLE_UNARY_RHS:
6367 return false;
6369 default:
6370 gcc_unreachable ();
6373 if (code == COND_EXPR && slp_node)
6374 return false;
6376 scalar_dest = gimple_assign_lhs (stmt);
6377 scalar_type = TREE_TYPE (scalar_dest);
6378 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6379 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6380 return false;
6382 /* Do not try to vectorize bit-precision reductions. */
6383 if (!type_has_mode_precision_p (scalar_type))
6384 return false;
6386 /* All uses but the last are expected to be defined in the loop.
6387 The last use is the reduction variable. In case of nested cycle this
6388 assumption is not true: we use reduc_index to record the index of the
6389 reduction variable. */
6390 gimple *reduc_def_stmt = NULL;
6391 int reduc_index = -1;
6392 for (i = 0; i < op_type; i++)
6394 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6395 if (i == 0 && code == COND_EXPR)
6396 continue;
6398 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6399 &def_stmt, &dts[i], &tem);
6400 dt = dts[i];
6401 gcc_assert (is_simple_use);
6402 if (dt == vect_reduction_def)
6404 reduc_def_stmt = def_stmt;
6405 reduc_index = i;
6406 continue;
6408 else if (tem)
6410 /* To properly compute ncopies we are interested in the widest
6411 input type in case we're looking at a widening accumulation. */
6412 if (!vectype_in
6413 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6414 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6415 vectype_in = tem;
6418 if (dt != vect_internal_def
6419 && dt != vect_external_def
6420 && dt != vect_constant_def
6421 && dt != vect_induction_def
6422 && !(dt == vect_nested_cycle && nested_cycle))
6423 return false;
6425 if (dt == vect_nested_cycle)
6427 found_nested_cycle_def = true;
6428 reduc_def_stmt = def_stmt;
6429 reduc_index = i;
6432 if (i == 1 && code == COND_EXPR)
6434 /* Record how value of COND_EXPR is defined. */
6435 if (dt == vect_constant_def)
6437 cond_reduc_dt = dt;
6438 cond_reduc_val = ops[i];
6440 if (dt == vect_induction_def
6441 && def_stmt != NULL
6442 && is_nonwrapping_integer_induction (def_stmt, loop))
6444 cond_reduc_dt = dt;
6445 cond_reduc_def_stmt = def_stmt;
6450 if (!vectype_in)
6451 vectype_in = vectype_out;
6453 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6454 directly used in stmt. */
6455 if (reduc_index == -1)
6457 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6459 if (dump_enabled_p ())
6460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6461 "in-order reduction chain without SLP.\n");
6462 return false;
6465 if (orig_stmt)
6466 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6467 else
6468 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6471 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6472 return false;
6474 if (!(reduc_index == -1
6475 || dts[reduc_index] == vect_reduction_def
6476 || dts[reduc_index] == vect_nested_cycle
6477 || ((dts[reduc_index] == vect_internal_def
6478 || dts[reduc_index] == vect_external_def
6479 || dts[reduc_index] == vect_constant_def
6480 || dts[reduc_index] == vect_induction_def)
6481 && nested_cycle && found_nested_cycle_def)))
6483 /* For pattern recognized stmts, orig_stmt might be a reduction,
6484 but some helper statements for the pattern might not, or
6485 might be COND_EXPRs with reduction uses in the condition. */
6486 gcc_assert (orig_stmt);
6487 return false;
6490 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6491 enum vect_reduction_type v_reduc_type
6492 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6493 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6495 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6496 /* If we have a condition reduction, see if we can simplify it further. */
6497 if (v_reduc_type == COND_REDUCTION)
6499 /* TODO: We can't yet handle reduction chains, since we need to treat
6500 each COND_EXPR in the chain specially, not just the last one.
6501 E.g. for:
6503 x_1 = PHI <x_3, ...>
6504 x_2 = a_2 ? ... : x_1;
6505 x_3 = a_3 ? ... : x_2;
6507 we're interested in the last element in x_3 for which a_2 || a_3
6508 is true, whereas the current reduction chain handling would
6509 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6510 as a reduction operation. */
6511 if (reduc_index == -1)
6513 if (dump_enabled_p ())
6514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6515 "conditional reduction chains not supported\n");
6516 return false;
6519 /* vect_is_simple_reduction ensured that operand 2 is the
6520 loop-carried operand. */
6521 gcc_assert (reduc_index == 2);
6523 /* Loop peeling modifies the initial value of the reduction PHI, which
6524 makes the reduction stmt that is transformed different from the
6525 original stmt analyzed. We need to record the reduction code for
6526 CONST_COND_REDUCTION type reductions at the analysis stage, so that
6527 it can be used directly at the transform stage. */
6528 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6529 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6531 /* Also set the reduction type to CONST_COND_REDUCTION. */
6532 gcc_assert (cond_reduc_dt == vect_constant_def);
6533 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6535 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6536 vectype_in, OPTIMIZE_FOR_SPEED))
6538 if (dump_enabled_p ())
6539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6540 "optimizing condition reduction with"
6541 " FOLD_EXTRACT_LAST.\n");
6542 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6544 else if (cond_reduc_dt == vect_induction_def)
6546 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6547 tree base
6548 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6549 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6551 gcc_assert (TREE_CODE (base) == INTEGER_CST
6552 && TREE_CODE (step) == INTEGER_CST);
6553 cond_reduc_val = NULL_TREE;
6554 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6555 above base; punt if base is the minimum value of the type for
6556 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6557 if (tree_int_cst_sgn (step) == -1)
6559 cond_reduc_op_code = MIN_EXPR;
6560 if (tree_int_cst_sgn (base) == -1)
6561 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6562 else if (tree_int_cst_lt (base,
6563 TYPE_MAX_VALUE (TREE_TYPE (base))))
6564 cond_reduc_val
6565 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6567 else
6569 cond_reduc_op_code = MAX_EXPR;
6570 if (tree_int_cst_sgn (base) == 1)
6571 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6572 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6573 base))
6574 cond_reduc_val
6575 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6577 if (cond_reduc_val)
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_NOTE, vect_location,
6581 "condition expression based on "
6582 "integer induction.\n");
6583 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6584 = INTEGER_INDUC_COND_REDUCTION;
6587 else if (cond_reduc_dt == vect_constant_def)
6589 enum vect_def_type cond_initial_dt;
6590 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6591 tree cond_initial_val
6592 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6594 gcc_assert (cond_reduc_val != NULL_TREE);
6595 vect_is_simple_use (cond_initial_val, loop_vinfo,
6596 &def_stmt, &cond_initial_dt);
6597 if (cond_initial_dt == vect_constant_def
6598 && types_compatible_p (TREE_TYPE (cond_initial_val),
6599 TREE_TYPE (cond_reduc_val)))
6601 tree e = fold_binary (LE_EXPR, boolean_type_node,
6602 cond_initial_val, cond_reduc_val);
6603 if (e && (integer_onep (e) || integer_zerop (e)))
6605 if (dump_enabled_p ())
6606 dump_printf_loc (MSG_NOTE, vect_location,
6607 "condition expression based on "
6608 "compile time constant.\n");
6609 /* Record reduction code at analysis stage. */
6610 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6611 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6612 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6613 = CONST_COND_REDUCTION;
6619 if (orig_stmt)
6620 gcc_assert (tmp == orig_stmt
6621 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6622 == orig_stmt));
6623 else
6624 /* We changed STMT to be the first stmt in reduction chain, hence we
6625 check that in this case the first element in the chain is STMT. */
6626 gcc_assert (stmt == tmp
6627 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6629 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6630 return false;
6632 if (slp_node)
6633 ncopies = 1;
6634 else
6635 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6637 gcc_assert (ncopies >= 1);
6639 vec_mode = TYPE_MODE (vectype_in);
6640 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6642 if (code == COND_EXPR)
6644 /* Only call during the analysis stage, otherwise we'll lose
6645 STMT_VINFO_TYPE. */
6646 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6647 ops[reduc_index], 0, NULL,
6648 cost_vec))
6650 if (dump_enabled_p ())
6651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6652 "unsupported condition in reduction\n");
6653 return false;
6656 else
6658 /* 4. Supportable by target? */
6660 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6661 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6663 /* Shifts and rotates are only supported by vectorizable_shift,
6664 not vectorizable_reduction. */
6665 if (dump_enabled_p ())
6666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6667 "unsupported shift or rotation.\n");
6668 return false;
6671 /* 4.1. check support for the operation in the loop */
6672 optab = optab_for_tree_code (code, vectype_in, optab_default);
6673 if (!optab)
6675 if (dump_enabled_p ())
6676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6677 "no optab.\n");
6679 return false;
6682 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6684 if (dump_enabled_p ())
6685 dump_printf (MSG_NOTE, "op not supported by target.\n");
6687 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6688 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6689 return false;
6691 if (dump_enabled_p ())
6692 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6695 /* Worthwhile without SIMD support? */
6696 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6697 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6701 "not worthwhile without SIMD support.\n");
6703 return false;
6707 /* 4.2. Check support for the epilog operation.
6709 If STMT represents a reduction pattern, then the type of the
6710 reduction variable may be different than the type of the rest
6711 of the arguments. For example, consider the case of accumulation
6712 of shorts into an int accumulator; The original code:
6713 S1: int_a = (int) short_a;
6714 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6716 was replaced with:
6717 STMT: int_acc = widen_sum <short_a, int_acc>
6719 This means that:
6720 1. The tree-code that is used to create the vector operation in the
6721 epilog code (that reduces the partial results) is not the
6722 tree-code of STMT, but is rather the tree-code of the original
6723 stmt from the pattern that STMT is replacing. I.e, in the example
6724 above we want to use 'widen_sum' in the loop, but 'plus' in the
6725 epilog.
6726 2. The type (mode) we use to check available target support
6727 for the vector operation to be created in the *epilog*, is
6728 determined by the type of the reduction variable (in the example
6729 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6730 However the type (mode) we use to check available target support
6731 for the vector operation to be created *inside the loop*, is
6732 determined by the type of the other arguments to STMT (in the
6733 example we'd check this: optab_handler (widen_sum_optab,
6734 vect_short_mode)).
6736 This is contrary to "regular" reductions, in which the types of all
6737 the arguments are the same as the type of the reduction variable.
6738 For "regular" reductions we can therefore use the same vector type
6739 (and also the same tree-code) when generating the epilog code and
6740 when generating the code inside the loop. */
6742 vect_reduction_type reduction_type
6743 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6744 if (orig_stmt
6745 && (reduction_type == TREE_CODE_REDUCTION
6746 || reduction_type == FOLD_LEFT_REDUCTION))
6748 /* This is a reduction pattern: get the vectype from the type of the
6749 reduction variable, and get the tree-code from orig_stmt. */
6750 orig_code = gimple_assign_rhs_code (orig_stmt);
6751 gcc_assert (vectype_out);
6752 vec_mode = TYPE_MODE (vectype_out);
6754 else
6756 /* Regular reduction: the same vectype and tree-code that are used for
6757 the vector code inside the loop can also be used for the epilog code. */
6758 orig_code = code;
6760 if (code == MINUS_EXPR)
6761 orig_code = PLUS_EXPR;
6763 /* For simple condition reductions, replace with the actual expression
6764 we want to base our reduction around. */
6765 if (reduction_type == CONST_COND_REDUCTION)
6767 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6768 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6770 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6771 orig_code = cond_reduc_op_code;
6774 if (nested_cycle)
6776 def_bb = gimple_bb (reduc_def_stmt);
6777 def_stmt_loop = def_bb->loop_father;
6778 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6779 loop_preheader_edge (def_stmt_loop));
6780 if (TREE_CODE (def_arg) == SSA_NAME
6781 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6782 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6783 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6784 && vinfo_for_stmt (def_arg_stmt)
6785 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6786 == vect_double_reduction_def)
6787 double_reduc = true;
6790 reduc_fn = IFN_LAST;
6792 if (reduction_type == TREE_CODE_REDUCTION
6793 || reduction_type == FOLD_LEFT_REDUCTION
6794 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6795 || reduction_type == CONST_COND_REDUCTION)
6797 if (reduction_type == FOLD_LEFT_REDUCTION
6798 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6799 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6801 if (reduc_fn != IFN_LAST
6802 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6803 OPTIMIZE_FOR_SPEED))
6805 if (dump_enabled_p ())
6806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6807 "reduc op not supported by target.\n");
6809 reduc_fn = IFN_LAST;
6812 else
6814 if (!nested_cycle || double_reduc)
6816 if (dump_enabled_p ())
6817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6818 "no reduc code for scalar code.\n");
6820 return false;
6824 else if (reduction_type == COND_REDUCTION)
6826 int scalar_precision
6827 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6828 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6829 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6830 nunits_out);
6832 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6833 OPTIMIZE_FOR_SPEED))
6834 reduc_fn = IFN_REDUC_MAX;
6837 if (reduction_type != EXTRACT_LAST_REDUCTION
6838 && reduc_fn == IFN_LAST
6839 && !nunits_out.is_constant ())
6841 if (dump_enabled_p ())
6842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6843 "missing target support for reduction on"
6844 " variable-length vectors.\n");
6845 return false;
6848 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6849 && ncopies > 1)
6851 if (dump_enabled_p ())
6852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6853 "multiple types in double reduction or condition "
6854 "reduction.\n");
6855 return false;
6858 /* For SLP reductions, see if there is a neutral value we can use. */
6859 tree neutral_op = NULL_TREE;
6860 if (slp_node)
6861 neutral_op = neutral_op_for_slp_reduction
6862 (slp_node_instance->reduc_phis, code,
6863 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6865 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6867 /* We can't support in-order reductions of code such as this:
6869 for (int i = 0; i < n1; ++i)
6870 for (int j = 0; j < n2; ++j)
6871 l += a[j];
6873 since GCC effectively transforms the loop when vectorizing:
6875 for (int i = 0; i < n1 / VF; ++i)
6876 for (int j = 0; j < n2; ++j)
6877 for (int k = 0; k < VF; ++k)
6878 l += a[j];
6880 which is a reassociation of the original operation. */
6881 if (dump_enabled_p ())
6882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6883 "in-order double reduction not supported.\n");
6885 return false;
6888 if (reduction_type == FOLD_LEFT_REDUCTION
6889 && slp_node
6890 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6892 /* We cannot use in-order reductions in this case because there is
6893 an implicit reassociation of the operations involved. */
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 "in-order unchained SLP reductions not supported.\n");
6897 return false;
6900 /* For double reductions, and for SLP reductions with a neutral value,
6901 we construct a variable-length initial vector by loading a vector
6902 full of the neutral value and then shift-and-inserting the start
6903 values into the low-numbered elements. */
6904 if ((double_reduc || neutral_op)
6905 && !nunits_out.is_constant ()
6906 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6907 vectype_out, OPTIMIZE_FOR_SPEED))
6909 if (dump_enabled_p ())
6910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6911 "reduction on variable-length vectors requires"
6912 " target support for a vector-shift-and-insert"
6913 " operation.\n");
6914 return false;
6917 /* Check extra constraints for variable-length unchained SLP reductions. */
6918 if (STMT_SLP_TYPE (stmt_info)
6919 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6920 && !nunits_out.is_constant ())
6922 /* We checked above that we could build the initial vector when
6923 there's a neutral element value. Check here for the case in
6924 which each SLP statement has its own initial value and in which
6925 that value needs to be repeated for every instance of the
6926 statement within the initial vector. */
6927 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6928 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6929 if (!neutral_op
6930 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6932 if (dump_enabled_p ())
6933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6934 "unsupported form of SLP reduction for"
6935 " variable-length vectors: cannot build"
6936 " initial vector.\n");
6937 return false;
6939 /* The epilogue code relies on the number of elements being a multiple
6940 of the group size. The duplicate-and-interleave approach to setting
6941 up the initial vector does too. */
6942 if (!multiple_p (nunits_out, group_size))
6944 if (dump_enabled_p ())
6945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6946 "unsupported form of SLP reduction for"
6947 " variable-length vectors: the vector size"
6948 " is not a multiple of the number of results.\n");
6949 return false;
6953 /* In case of widening multiplication by a constant, we update the type
6954 of the constant to be the type of the other operand. We check that the
6955 constant fits the type in the pattern recognition pass. */
6956 if (code == DOT_PROD_EXPR
6957 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6959 if (TREE_CODE (ops[0]) == INTEGER_CST)
6960 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6961 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6962 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6963 else
6965 if (dump_enabled_p ())
6966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967 "invalid types in dot-prod\n");
6969 return false;
6973 if (reduction_type == COND_REDUCTION)
6975 widest_int ni;
6977 if (! max_loop_iterations (loop, &ni))
6979 if (dump_enabled_p ())
6980 dump_printf_loc (MSG_NOTE, vect_location,
6981 "loop count not known, cannot create cond "
6982 "reduction.\n");
6983 return false;
6985 /* Convert backedges to iterations. */
6986 ni += 1;
6988 /* The additional index will have the same type as the condition. Check
6989 that the loop can fit into this type less one (because we'll use up
6990 the zero slot for when there are no matches). */
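/* Worked example (illustrative values): with an unsigned char index
   type, MAX_INDEX is 255; a loop whose iteration count may reach 255
   or more is rejected by the check below, since index 0 is reserved
   for the no-match case and the remaining values must be able to
   number every iteration. */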
6991 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6992 if (wi::geu_p (ni, wi::to_widest (max_index)))
6994 if (dump_enabled_p ())
6995 dump_printf_loc (MSG_NOTE, vect_location,
6996 "loop size is greater than data size.\n");
6997 return false;
7001 /* In case the vectorization factor (VF) is bigger than the number
7002 of elements that we can fit in a vectype (nunits), we have to generate
7003 more than one vector stmt - i.e - we need to "unroll" the
7004 vector stmt by a factor VF/nunits. For more details see documentation
7005 in vectorizable_operation. */
7007 /* If the reduction is used in an outer loop we need to generate
7008 VF intermediate results, like so (e.g. for ncopies=2):
7009 r0 = phi (init, r0)
7010 r1 = phi (init, r1)
7011 r0 = x0 + r0;
7012 r1 = x1 + r1;
7013 (i.e. we generate VF results in 2 registers).
7014 In this case we have a separate def-use cycle for each copy, and therefore
7015 for each copy we get the vector def for the reduction variable from the
7016 respective phi node created for this copy.
7018 Otherwise (the reduction is unused in the loop nest), we can combine
7019 together intermediate results, like so (e.g. for ncopies=2):
7020 r = phi (init, r)
7021 r = x0 + r;
7022 r = x1 + r;
7023 (i.e. we generate VF/2 results in a single register).
7024 In this case for each copy we get the vector def for the reduction variable
7025 from the vectorized reduction operation generated in the previous iteration.
7027 This only works when we see both the reduction PHI and its only consumer
7028 in vectorizable_reduction and there are no intermediate stmts
7029 participating. */
7030 use_operand_p use_p;
7031 gimple *use_stmt;
7032 if (ncopies > 1
7033 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7034 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7035 && (use_stmt == stmt
7036 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7038 single_defuse_cycle = true;
7039 epilog_copies = 1;
7041 else
7042 epilog_copies = ncopies;
7044 /* If the reduction stmt is one of the patterns that have lane
7045 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7046 if ((ncopies > 1
7047 && ! single_defuse_cycle)
7048 && (code == DOT_PROD_EXPR
7049 || code == WIDEN_SUM_EXPR
7050 || code == SAD_EXPR))
7052 if (dump_enabled_p ())
7053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7054 "multi def-use cycle not possible for lane-reducing "
7055 "reduction operation\n");
7056 return false;
7059 if (slp_node)
7060 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7061 else
7062 vec_num = 1;
7064 internal_fn cond_fn = get_conditional_internal_fn (code);
7065 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7067 if (!vec_stmt) /* transformation not required. */
7069 if (first_p)
7070 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7071 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7073 if (reduction_type != FOLD_LEFT_REDUCTION
7074 && (cond_fn == IFN_LAST
7075 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7076 OPTIMIZE_FOR_SPEED)))
7078 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 "can't use a fully-masked loop because no"
7081 " conditional operation is available.\n");
7082 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7084 else if (reduc_index == -1)
7086 if (dump_enabled_p ())
7087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7088 "can't use a fully-masked loop for chained"
7089 " reductions.\n");
7090 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7092 else
7093 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7094 vectype_in);
7096 if (dump_enabled_p ()
7097 && reduction_type == FOLD_LEFT_REDUCTION)
7098 dump_printf_loc (MSG_NOTE, vect_location,
7099 "using an in-order (fold-left) reduction.\n");
7100 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7101 return true;
7104 /* Transform. */
7106 if (dump_enabled_p ())
7107 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7109 /* FORNOW: Multiple types are not supported for condition. */
7110 if (code == COND_EXPR)
7111 gcc_assert (ncopies == 1);
7113 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7115 if (reduction_type == FOLD_LEFT_REDUCTION)
7116 return vectorize_fold_left_reduction
7117 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7118 reduc_fn, ops, vectype_in, reduc_index, masks);
7120 if (reduction_type == EXTRACT_LAST_REDUCTION)
7122 gcc_assert (!slp_node);
7123 return vectorizable_condition (stmt, gsi, vec_stmt,
7124 NULL, reduc_index, NULL, NULL);
7127 /* Create the destination vector */
7128 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7130 prev_stmt_info = NULL;
7131 prev_phi_info = NULL;
7132 if (!slp_node)
7134 vec_oprnds0.create (1);
7135 vec_oprnds1.create (1);
7136 if (op_type == ternary_op)
7137 vec_oprnds2.create (1);
7140 phis.create (vec_num);
7141 vect_defs.create (vec_num);
7142 if (!slp_node)
7143 vect_defs.quick_push (NULL_TREE);
7145 if (slp_node)
7146 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7147 else
7148 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7150 for (j = 0; j < ncopies; j++)
7152 if (code == COND_EXPR)
7154 gcc_assert (!slp_node);
7155 vectorizable_condition (stmt, gsi, vec_stmt,
7156 PHI_RESULT (phis[0]),
7157 reduc_index, NULL, NULL);
7158 /* Multiple types are not supported for condition. */
7159 break;
7162 /* Handle uses. */
7163 if (j == 0)
7165 if (slp_node)
7167 /* Get vec defs for all the operands except the reduction index,
7168 ensuring the ordering of the ops in the vector is kept. */
7169 auto_vec<tree, 3> slp_ops;
7170 auto_vec<vec<tree>, 3> vec_defs;
7172 slp_ops.quick_push (ops[0]);
7173 slp_ops.quick_push (ops[1]);
7174 if (op_type == ternary_op)
7175 slp_ops.quick_push (ops[2]);
7177 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7179 vec_oprnds0.safe_splice (vec_defs[0]);
7180 vec_defs[0].release ();
7181 vec_oprnds1.safe_splice (vec_defs[1]);
7182 vec_defs[1].release ();
7183 if (op_type == ternary_op)
7185 vec_oprnds2.safe_splice (vec_defs[2]);
7186 vec_defs[2].release ();
7189 else
7191 vec_oprnds0.quick_push
7192 (vect_get_vec_def_for_operand (ops[0], stmt));
7193 vec_oprnds1.quick_push
7194 (vect_get_vec_def_for_operand (ops[1], stmt));
7195 if (op_type == ternary_op)
7196 vec_oprnds2.quick_push
7197 (vect_get_vec_def_for_operand (ops[2], stmt));
7200 else
7202 if (!slp_node)
7204 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7206 if (single_defuse_cycle && reduc_index == 0)
7207 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7208 else
7209 vec_oprnds0[0]
7210 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7211 if (single_defuse_cycle && reduc_index == 1)
7212 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7213 else
7214 vec_oprnds1[0]
7215 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7216 if (op_type == ternary_op)
7218 if (single_defuse_cycle && reduc_index == 2)
7219 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7220 else
7221 vec_oprnds2[0]
7222 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7227 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7229 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7230 if (masked_loop_p)
7232 /* Make sure that the reduction accumulator is vop[0]. */
7233 if (reduc_index == 1)
7235 gcc_assert (commutative_tree_code (code));
7236 std::swap (vop[0], vop[1]);
7238 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7239 vectype_in, i * ncopies + j);
7240 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7241 vop[0], vop[1],
7242 vop[0]);
7243 new_temp = make_ssa_name (vec_dest, call);
7244 gimple_call_set_lhs (call, new_temp);
7245 gimple_call_set_nothrow (call, true);
7246 new_stmt = call;
7248 else
7250 if (op_type == ternary_op)
7251 vop[2] = vec_oprnds2[i];
7253 new_temp = make_ssa_name (vec_dest, new_stmt);
7254 new_stmt = gimple_build_assign (new_temp, code,
7255 vop[0], vop[1], vop[2]);
7257 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7259 if (slp_node)
7261 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7262 vect_defs.quick_push (new_temp);
7264 else
7265 vect_defs[0] = new_temp;
7268 if (slp_node)
7269 continue;
7271 if (j == 0)
7272 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7273 else
7274 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7276 prev_stmt_info = vinfo_for_stmt (new_stmt);
7279 /* Finalize the reduction-phi (set its arguments) and create the
7280 epilog reduction code. */
7281 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7282 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7284 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7285 epilog_copies, reduc_fn, phis,
7286 double_reduc, slp_node, slp_node_instance,
7287 cond_reduc_val, cond_reduc_op_code,
7288 neutral_op);
7290 return true;
7293 /* Function vect_min_worthwhile_factor.
7295 For a loop where we could vectorize the operation indicated by CODE,
7296 return the minimum vectorization factor that makes it worthwhile
7297 to use generic vectors. */
7298 static unsigned int
7299 vect_min_worthwhile_factor (enum tree_code code)
7301 switch (code)
7303 case PLUS_EXPR:
7304 case MINUS_EXPR:
7305 case NEGATE_EXPR:
7306 return 4;
7308 case BIT_AND_EXPR:
7309 case BIT_IOR_EXPR:
7310 case BIT_XOR_EXPR:
7311 case BIT_NOT_EXPR:
7312 return 2;
7314 default:
7315 return INT_MAX;
7319 /* Return true if VINFO indicates we are doing loop vectorization and if
7320 it is worth decomposing CODE operations into scalar operations for
7321 that loop's vectorization factor. */
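/* Example of the intent (illustrative only): with a compile-time
   vectorization factor of 2, BIT_AND_EXPR (minimum worthwhile factor 2)
   qualifies, whereas PLUS_EXPR (minimum worthwhile factor 4) does not. */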
7323 bool
7324 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7326 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7327 unsigned HOST_WIDE_INT value;
7328 return (loop_vinfo
7329 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7330 && value >= vect_min_worthwhile_factor (code));
7333 /* Function vectorizable_induction
7335 Check if PHI performs an induction computation that can be vectorized.
7336 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7337 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7338 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7340 bool
7341 vectorizable_induction (gimple *phi,
7342 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7343 gimple **vec_stmt, slp_tree slp_node,
7344 stmt_vector_for_cost *cost_vec)
7346 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7347 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7348 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7349 unsigned ncopies;
7350 bool nested_in_vect_loop = false;
7351 struct loop *iv_loop;
7352 tree vec_def;
7353 edge pe = loop_preheader_edge (loop);
7354 basic_block new_bb;
7355 tree new_vec, vec_init, vec_step, t;
7356 tree new_name;
7357 gimple *new_stmt;
7358 gphi *induction_phi;
7359 tree induc_def, vec_dest;
7360 tree init_expr, step_expr;
7361 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7362 unsigned i;
7363 tree expr;
7364 gimple_seq stmts;
7365 imm_use_iterator imm_iter;
7366 use_operand_p use_p;
7367 gimple *exit_phi;
7368 edge latch_e;
7369 tree loop_arg;
7370 gimple_stmt_iterator si;
7371 basic_block bb = gimple_bb (phi);
7373 if (gimple_code (phi) != GIMPLE_PHI)
7374 return false;
7376 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7377 return false;
7379 /* Make sure it was recognized as induction computation. */
7380 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7381 return false;
7383 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7384 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7386 if (slp_node)
7387 ncopies = 1;
7388 else
7389 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7390 gcc_assert (ncopies >= 1);
7392 /* FORNOW. These restrictions should be relaxed. */
7393 if (nested_in_vect_loop_p (loop, phi))
7395 imm_use_iterator imm_iter;
7396 use_operand_p use_p;
7397 gimple *exit_phi;
7398 edge latch_e;
7399 tree loop_arg;
7401 if (ncopies > 1)
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7405 "multiple types in nested loop.\n");
7406 return false;
7409 /* FORNOW: outer loop induction with SLP not supported. */
7410 if (STMT_SLP_TYPE (stmt_info))
7411 return false;
7413 exit_phi = NULL;
7414 latch_e = loop_latch_edge (loop->inner);
7415 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7416 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7418 gimple *use_stmt = USE_STMT (use_p);
7419 if (is_gimple_debug (use_stmt))
7420 continue;
7422 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7424 exit_phi = use_stmt;
7425 break;
7428 if (exit_phi)
7430 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7431 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7432 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7434 if (dump_enabled_p ())
7435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7436 "inner-loop induction only used outside "
7437 "of the outer vectorized loop.\n");
7438 return false;
7442 nested_in_vect_loop = true;
7443 iv_loop = loop->inner;
7445 else
7446 iv_loop = loop;
7447 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7449 if (slp_node && !nunits.is_constant ())
7451 /* The current SLP code creates the initial value element-by-element. */
7452 if (dump_enabled_p ())
7453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7454 "SLP induction not supported for variable-length"
7455 " vectors.\n");
7456 return false;
7459 if (!vec_stmt) /* transformation not required. */
7461 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7462 DUMP_VECT_SCOPE ("vectorizable_induction");
7463 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7464 return true;
7467 /* Transform. */
7469 /* Compute a vector variable, initialized with the first VF values of
7470 the induction variable. E.g., for an iv with IV_PHI='X' and
7471 evolution S, for a vector of 4 units, we want to compute:
7472 [X, X + S, X + 2*S, X + 3*S]. */
7474 if (dump_enabled_p ())
7475 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7477 latch_e = loop_latch_edge (iv_loop);
7478 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7480 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7481 gcc_assert (step_expr != NULL_TREE);
7483 pe = loop_preheader_edge (iv_loop);
7484 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7485 loop_preheader_edge (iv_loop));
7487 stmts = NULL;
7488 if (!nested_in_vect_loop)
7490 /* Convert the initial value to the desired type. */
7491 tree new_type = TREE_TYPE (vectype);
7492 init_expr = gimple_convert (&stmts, new_type, init_expr);
7494 /* If we are using the loop mask to "peel" for alignment then we need
7495 to adjust the start value here. */
7496 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7497 if (skip_niters != NULL_TREE)
7499 if (FLOAT_TYPE_P (vectype))
7500 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7501 skip_niters);
7502 else
7503 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7504 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7505 skip_niters, step_expr);
7506 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7507 init_expr, skip_step);
7511 /* Convert the step to the desired type. */
7512 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7514 if (stmts)
7516 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7517 gcc_assert (!new_bb);
7520 /* Find the first insertion point in the BB. */
7521 si = gsi_after_labels (bb);
7523 /* For SLP induction we have to generate several IVs as for example
7524 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7525 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7526 [VF*S, VF*S, VF*S, VF*S] for all. */
7527 if (slp_node)
7529 /* Enforced above. */
7530 unsigned int const_nunits = nunits.to_constant ();
7532 /* Generate [VF*S, VF*S, ... ]. */
7533 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7535 expr = build_int_cst (integer_type_node, vf);
7536 expr = fold_convert (TREE_TYPE (step_expr), expr);
7538 else
7539 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7540 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7541 expr, step_expr);
7542 if (! CONSTANT_CLASS_P (new_name))
7543 new_name = vect_init_vector (phi, new_name,
7544 TREE_TYPE (step_expr), NULL);
7545 new_vec = build_vector_from_val (vectype, new_name);
7546 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7548 /* Now generate the IVs. */
7549 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7550 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7551 unsigned elts = const_nunits * nvects;
7552 unsigned nivs = least_common_multiple (group_size,
7553 const_nunits) / const_nunits;
7554 gcc_assert (elts % group_size == 0);
7555 tree elt = init_expr;
7556 unsigned ivn;
7557 for (ivn = 0; ivn < nivs; ++ivn)
7559 tree_vector_builder elts (vectype, const_nunits, 1);
7560 stmts = NULL;
7561 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7563 if (ivn*const_nunits + eltn >= group_size
7564 && (ivn * const_nunits + eltn) % group_size == 0)
7565 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7566 elt, step_expr);
7567 elts.quick_push (elt);
7569 vec_init = gimple_build_vector (&stmts, &elts);
7570 if (stmts)
7572 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7573 gcc_assert (!new_bb);
7576 /* Create the induction-phi that defines the induction-operand. */
7577 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7578 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7579 set_vinfo_for_stmt (induction_phi,
7580 new_stmt_vec_info (induction_phi, loop_vinfo));
7581 induc_def = PHI_RESULT (induction_phi);
7583 /* Create the iv update inside the loop */
7584 vec_def = make_ssa_name (vec_dest);
7585 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7586 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7587 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7589 /* Set the arguments of the phi node: */
7590 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7591 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7592 UNKNOWN_LOCATION);
7594 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7597 /* Re-use IVs when we can. */
7598 if (ivn < nvects)
7600 unsigned vfp
7601 = least_common_multiple (group_size, const_nunits) / group_size;
7602 /* Generate [VF'*S, VF'*S, ... ]. */
7603 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7605 expr = build_int_cst (integer_type_node, vfp);
7606 expr = fold_convert (TREE_TYPE (step_expr), expr);
7608 else
7609 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7610 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7611 expr, step_expr);
7612 if (! CONSTANT_CLASS_P (new_name))
7613 new_name = vect_init_vector (phi, new_name,
7614 TREE_TYPE (step_expr), NULL);
7615 new_vec = build_vector_from_val (vectype, new_name);
7616 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7617 for (; ivn < nvects; ++ivn)
7619 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7620 tree def;
7621 if (gimple_code (iv) == GIMPLE_PHI)
7622 def = gimple_phi_result (iv);
7623 else
7624 def = gimple_assign_lhs (iv);
7625 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7626 PLUS_EXPR,
7627 def, vec_step);
7628 if (gimple_code (iv) == GIMPLE_PHI)
7629 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7630 else
7632 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7633 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7635 set_vinfo_for_stmt (new_stmt,
7636 new_stmt_vec_info (new_stmt, loop_vinfo));
7637 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7641 return true;
7644 /* Create the vector that holds the initial_value of the induction. */
7645 if (nested_in_vect_loop)
7647 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7648 been created during vectorization of previous stmts. We obtain it
7649 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7650 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7651 /* If the initial value is not of proper type, convert it. */
7652 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7654 new_stmt
7655 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7656 vect_simple_var,
7657 "vec_iv_"),
7658 VIEW_CONVERT_EXPR,
7659 build1 (VIEW_CONVERT_EXPR, vectype,
7660 vec_init));
7661 vec_init = gimple_assign_lhs (new_stmt);
7662 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7663 new_stmt);
7664 gcc_assert (!new_bb);
7665 set_vinfo_for_stmt (new_stmt,
7666 new_stmt_vec_info (new_stmt, loop_vinfo));
7669 else
7671 /* iv_loop is the loop to be vectorized. Create:
7672 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7673 stmts = NULL;
7674 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7676 unsigned HOST_WIDE_INT const_nunits;
7677 if (nunits.is_constant (&const_nunits))
7679 tree_vector_builder elts (vectype, const_nunits, 1);
7680 elts.quick_push (new_name);
7681 for (i = 1; i < const_nunits; i++)
7683 /* Create: new_name_i = new_name + step_expr */
7684 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7685 new_name, step_expr);
7686 elts.quick_push (new_name);
7688 /* Create a vector from [new_name_0, new_name_1, ...,
7689 new_name_nunits-1] */
7690 vec_init = gimple_build_vector (&stmts, &elts);
7692 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7693 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7694 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7695 new_name, step_expr);
7696 else
7698 /* Build:
7699 [base, base, base, ...]
7700 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7701 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7702 gcc_assert (flag_associative_math);
7703 tree index = build_index_vector (vectype, 0, 1);
7704 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7705 new_name);
7706 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7707 step_expr);
7708 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7709 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7710 vec_init, step_vec);
7711 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7712 vec_init, base_vec);
7715 if (stmts)
7717 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7718 gcc_assert (!new_bb);
7723 /* Create the vector that holds the step of the induction. */
7724 if (nested_in_vect_loop)
7725 /* iv_loop is nested in the loop to be vectorized. Generate:
7726 vec_step = [S, S, S, S] */
7727 new_name = step_expr;
7728 else
7730 /* iv_loop is the loop to be vectorized. Generate:
7731 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7732 gimple_seq seq = NULL;
7733 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7735 expr = build_int_cst (integer_type_node, vf);
7736 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7738 else
7739 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7740 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7741 expr, step_expr);
7742 if (seq)
7744 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7745 gcc_assert (!new_bb);
7749 t = unshare_expr (new_name);
7750 gcc_assert (CONSTANT_CLASS_P (new_name)
7751 || TREE_CODE (new_name) == SSA_NAME);
7752 new_vec = build_vector_from_val (vectype, t);
7753 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7756 /* Create the following def-use cycle:
7757 loop prolog:
7758 vec_init = ...
7759 vec_step = ...
7760 loop:
7761 vec_iv = PHI <vec_init, vec_loop>
7763 STMT
7765 vec_loop = vec_iv + vec_step; */
7767 /* Create the induction-phi that defines the induction-operand. */
7768 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7769 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7770 set_vinfo_for_stmt (induction_phi,
7771 new_stmt_vec_info (induction_phi, loop_vinfo));
7772 induc_def = PHI_RESULT (induction_phi);
7774 /* Create the iv update inside the loop */
7775 vec_def = make_ssa_name (vec_dest);
7776 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7777 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7778 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7780 /* Set the arguments of the phi node: */
7781 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7782 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7783 UNKNOWN_LOCATION);
7785 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7787 /* In case the vectorization factor (VF) is bigger than the number
7788 of elements that we can fit in a vectype (nunits), we have to generate
7789 more than one vector stmt - i.e - we need to "unroll" the
7790 vector stmt by a factor VF/nunits. For more details see documentation
7791 in vectorizable_operation. */
7793 if (ncopies > 1)
7795 gimple_seq seq = NULL;
7796 stmt_vec_info prev_stmt_vinfo;
7797 /* FORNOW. This restriction should be relaxed. */
7798 gcc_assert (!nested_in_vect_loop);
7800 /* Create the vector that holds the step of the induction. */
7801 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7803 expr = build_int_cst (integer_type_node, nunits);
7804 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7806 else
7807 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7808 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7809 expr, step_expr);
7810 if (seq)
7812 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7813 gcc_assert (!new_bb);
7816 t = unshare_expr (new_name);
7817 gcc_assert (CONSTANT_CLASS_P (new_name)
7818 || TREE_CODE (new_name) == SSA_NAME);
7819 new_vec = build_vector_from_val (vectype, t);
7820 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7822 vec_def = induc_def;
7823 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7824 for (i = 1; i < ncopies; i++)
7826 /* vec_i = vec_prev + vec_step */
7827 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7828 vec_def, vec_step);
7829 vec_def = make_ssa_name (vec_dest, new_stmt);
7830 gimple_assign_set_lhs (new_stmt, vec_def);
7832 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7833 set_vinfo_for_stmt (new_stmt,
7834 new_stmt_vec_info (new_stmt, loop_vinfo));
7835 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7836 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7840 if (nested_in_vect_loop)
7842 /* Find the loop-closed exit-phi of the induction, and record
7843 the final vector of induction results: */
7844 exit_phi = NULL;
7845 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7847 gimple *use_stmt = USE_STMT (use_p);
7848 if (is_gimple_debug (use_stmt))
7849 continue;
7851 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7853 exit_phi = use_stmt;
7854 break;
7857 if (exit_phi)
7859 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7860 /* FORNOW. Currently not supporting the case that an inner-loop induction
7861 is not used in the outer-loop (i.e. only outside the outer-loop). */
7862 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7863 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7865 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7866 if (dump_enabled_p ())
7868 dump_printf_loc (MSG_NOTE, vect_location,
7869 "vector of inductions after inner-loop:");
7870 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7876 if (dump_enabled_p ())
7878 dump_printf_loc (MSG_NOTE, vect_location,
7879 "transform induction: created def-use cycle: ");
7880 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7881 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7882 SSA_NAME_DEF_STMT (vec_def), 0);
7885 return true;
7888 /* Function vectorizable_live_operation.
7890 STMT computes a value that is used outside the loop. Check if
7891 it can be supported. */
7893 bool
7894 vectorizable_live_operation (gimple *stmt,
7895 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7896 slp_tree slp_node, int slp_index,
7897 gimple **vec_stmt,
7898 stmt_vector_for_cost *)
7900 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7901 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7902 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7903 imm_use_iterator imm_iter;
7904 tree lhs, lhs_type, bitsize, vec_bitsize;
7905 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7906 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7907 int ncopies;
7908 gimple *use_stmt;
7909 auto_vec<tree> vec_oprnds;
7910 int vec_entry = 0;
7911 poly_uint64 vec_index = 0;
7913 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7915 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7916 return false;
7918 /* FORNOW. CHECKME. */
7919 if (nested_in_vect_loop_p (loop, stmt))
7920 return false;
7922 /* If STMT is not relevant and it is a simple assignment and its inputs are
7923 invariant then it can remain in place, unvectorized. The original last
7924 scalar value that it computes will be used. */
7925 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7927 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7928 if (dump_enabled_p ())
7929 dump_printf_loc (MSG_NOTE, vect_location,
7930 "statement is simple and uses invariant. Leaving in "
7931 "place.\n");
7932 return true;
7935 if (slp_node)
7936 ncopies = 1;
7937 else
7938 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7940 if (slp_node)
7942 gcc_assert (slp_index >= 0);
7944 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7945 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7947 /* Get the last occurrence of the scalar index from the concatenation of
7948 all the slp vectors. Calculate which slp vector it is and the index
7949 within. */
7950 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7952 /* Calculate which vector contains the result, and which lane of
7953 that vector we need. */
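/* Worked example with made-up numbers: two vectors of four lanes
   (num_vec 2, nunits 4) holding three scalar stmts (num_scalar 3) and
   slp_index 1 give pos = 2 * 4 - 3 + 1 = 6, i.e. vec_entry 1 and
   vec_index 2. */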
7954 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7958 "Cannot determine which vector holds the"
7959 " final result.\n");
7960 return false;
7964 if (!vec_stmt)
7966 /* No transformation required. */
7967 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7969 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7970 OPTIMIZE_FOR_SPEED))
7972 if (dump_enabled_p ())
7973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7974 "can't use a fully-masked loop because "
7975 "the target doesn't support extract last "
7976 "reduction.\n");
7977 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7979 else if (slp_node)
7981 if (dump_enabled_p ())
7982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7983 "can't use a fully-masked loop because an "
7984 "SLP statement is live after the loop.\n");
7985 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7987 else if (ncopies > 1)
7989 if (dump_enabled_p ())
7990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7991 "can't use a fully-masked loop because"
7992 " ncopies is greater than 1.\n");
7993 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7995 else
7997 gcc_assert (ncopies == 1 && !slp_node);
7998 vect_record_loop_mask (loop_vinfo,
7999 &LOOP_VINFO_MASKS (loop_vinfo),
8000 1, vectype);
8003 return true;
8006 /* If stmt has a related stmt, then use that for getting the lhs. */
8007 if (is_pattern_stmt_p (stmt_info))
8008 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8010 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8011 : gimple_get_lhs (stmt);
8012 lhs_type = TREE_TYPE (lhs);
8014 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8015 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8016 : TYPE_SIZE (TREE_TYPE (vectype)));
8017 vec_bitsize = TYPE_SIZE (vectype);
8019 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8020 tree vec_lhs, bitstart;
8021 if (slp_node)
8023 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8025 /* Get the correct slp vectorized stmt. */
8026 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8027 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8028 vec_lhs = gimple_phi_result (phi);
8029 else
8030 vec_lhs = gimple_get_lhs (vec_stmt);
8032 /* Get entry to use. */
8033 bitstart = bitsize_int (vec_index);
8034 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8036 else
8038 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8039 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8040 gcc_checking_assert (ncopies == 1
8041 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8043 /* For multiple copies, get the last copy. */
8044 for (int i = 1; i < ncopies; ++i)
8045 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8046 vec_lhs);
8048 /* Get the last lane in the vector. */
8049 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8052 gimple_seq stmts = NULL;
8053 tree new_tree;
8054 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8056 /* Emit:
8058 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8060 where VEC_LHS is the vectorized live-out result and MASK is
8061 the loop mask for the final iteration. */
8062 gcc_assert (ncopies == 1 && !slp_node);
8063 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8064 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8065 1, vectype, 0);
8066 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8067 scalar_type, mask, vec_lhs);
8069 /* Convert the extracted vector element to the required scalar type. */
8070 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8072 else
8074 tree bftype = TREE_TYPE (vectype);
8075 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8076 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8077 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8078 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8079 &stmts, true, NULL_TREE);
8082 if (stmts)
8083 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8085 /* Replace use of lhs with newly computed result. If the use stmt is a
8086 single arg PHI, just replace all uses of PHI result. It's necessary
8087 because lcssa PHI defining lhs may be before newly inserted stmt. */
8088 use_operand_p use_p;
8089 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8090 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8091 && !is_gimple_debug (use_stmt))
8093 if (gimple_code (use_stmt) == GIMPLE_PHI
8094 && gimple_phi_num_args (use_stmt) == 1)
8096 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8098 else
8100 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8101 SET_USE (use_p, new_tree);
8103 update_stmt (use_stmt);
8106 return true;
8109 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8111 static void
8112 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8114 ssa_op_iter op_iter;
8115 imm_use_iterator imm_iter;
8116 def_operand_p def_p;
8117 gimple *ustmt;
8119 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8121 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8123 basic_block bb;
8125 if (!is_gimple_debug (ustmt))
8126 continue;
8128 bb = gimple_bb (ustmt);
8130 if (!flow_bb_inside_loop_p (loop, bb))
8132 if (gimple_debug_bind_p (ustmt))
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_NOTE, vect_location,
8136 "killing debug use\n");
8138 gimple_debug_bind_reset_value (ustmt);
8139 update_stmt (ustmt);
8141 else
8142 gcc_unreachable ();
8148 /* Given loop represented by LOOP_VINFO, return true if computation of
8149 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8150 otherwise. */
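/* Example of the overflow being checked (illustrative only): if the
   niters type is an 8-bit unsigned type and NITERSM1 is 255, then
   NITERS wraps around to 0; the constant-case test below
   (NITERSM1 < NITERS) fails for exactly that situation. */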
8152 static bool
8153 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8155 /* Constant case. */
8156 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8158 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8159 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8161 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8162 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8163 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8164 return true;
8167 widest_int max;
8168 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8169 /* Check the upper bound of loop niters. */
8170 if (get_max_loop_iterations (loop, &max))
8172 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8173 signop sgn = TYPE_SIGN (type);
8174 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8175 if (max < type_max)
8176 return true;
8178 return false;
8181 /* Return a mask type with half the number of elements as TYPE. */
8183 tree
8184 vect_halve_mask_nunits (tree type)
8186 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8187 return build_truth_vector_type (nunits, current_vector_size);
8190 /* Return a mask type with twice as many elements as TYPE. */
8192 tree
8193 vect_double_mask_nunits (tree type)
8195 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8196 return build_truth_vector_type (nunits, current_vector_size);
8199 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8200 contain a sequence of NVECTORS masks that each control a vector of type
8201 VECTYPE. */
8203 void
8204 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8205 unsigned int nvectors, tree vectype)
8207 gcc_assert (nvectors != 0);
8208 if (masks->length () < nvectors)
8209 masks->safe_grow_cleared (nvectors);
8210 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8211 /* The number of scalars per iteration and the number of vectors are
8212 both compile-time constants. */
8213 unsigned int nscalars_per_iter
8214 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8215 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
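/* Illustrative numbers: with a vectorization factor of 16, four
   vectors of eight elements each give nscalars_per_iter
   = 4 * 8 / 16 = 2 scalars per scalar iteration for this rgroup. */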
8216 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8218 rgm->max_nscalars_per_iter = nscalars_per_iter;
8219 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8223 /* Given a complete set of masks MASKS, extract mask number INDEX
8224 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8225 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8227 See the comment above vec_loop_masks for more details about the mask
8228 arrangement. */
8230 tree
8231 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8232 unsigned int nvectors, tree vectype, unsigned int index)
8234 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8235 tree mask_type = rgm->mask_type;
8237 /* Populate the rgroup's mask array, if this is the first time we've
8238 used it. */
8239 if (rgm->masks.is_empty ())
8241 rgm->masks.safe_grow_cleared (nvectors);
8242 for (unsigned int i = 0; i < nvectors; ++i)
8244 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8245 /* Provide a dummy definition until the real one is available. */
8246 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8247 rgm->masks[i] = mask;
8251 tree mask = rgm->masks[index];
8252 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8253 TYPE_VECTOR_SUBPARTS (vectype)))
8255 /* A loop mask for data type X can be reused for data type Y
8256 if X has N times more elements than Y and if Y's elements
8257 are N times bigger than X's. In this case each sequence
8258 of N elements in the loop mask will be all-zero or all-one.
8259 We can then view-convert the mask so that each sequence of
8260 N elements is replaced by a single element. */
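/* Concrete instance of the rule above (illustrative types): a mask
   built for eight 16-bit elements can be reused for four 32-bit
   elements; every pair of adjacent mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below collapses each pair
   into a single wider mask element. */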
8261 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8262 TYPE_VECTOR_SUBPARTS (vectype)));
8263 gimple_seq seq = NULL;
8264 mask_type = build_same_sized_truth_vector_type (vectype);
8265 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8266 if (seq)
8267 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8269 return mask;
8272 /* Scale profiling counters by estimation for LOOP which is vectorized
8273 by factor VF. */
8275 static void
8276 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8278 edge preheader = loop_preheader_edge (loop);
8279 /* Reduce loop iterations by the vectorization factor. */
8280 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8281 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8283 if (freq_h.nonzero_p ())
8285 profile_probability p;
8287 /* Avoid dropping loop body profile counter to 0 because of zero count
8288 in loop's preheader. */
8289 if (!(freq_e == profile_count::zero ()))
8290 freq_e = freq_e.force_nonzero ();
8291 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8292 scale_loop_frequencies (loop, p);
8295 edge exit_e = single_exit (loop);
8296 exit_e->probability = profile_probability::always ()
8297 .apply_scale (1, new_est_niter + 1);
8299 edge exit_l = single_pred_edge (loop->latch);
8300 profile_probability prob = exit_l->probability;
8301 exit_l->probability = exit_e->probability.invert ();
8302 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8303 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8306 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8307 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8308 *SLP_SCHEDULED is a running record of whether we have called
8309 vect_schedule_slp. */
8311 static void
8312 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8313 gimple_stmt_iterator *gsi,
8314 stmt_vec_info *seen_store, bool *slp_scheduled)
8316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8317 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8318 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8319 if (!stmt_info)
8320 return;
8322 if (dump_enabled_p ())
8324 dump_printf_loc (MSG_NOTE, vect_location,
8325 "------>vectorizing statement: ");
8326 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8329 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8330 vect_loop_kill_debug_uses (loop, stmt);
8332 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8333 && !STMT_VINFO_LIVE_P (stmt_info))
8334 return;
8336 if (STMT_VINFO_VECTYPE (stmt_info))
8338 poly_uint64 nunits
8339 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8340 if (!STMT_SLP_TYPE (stmt_info)
8341 && maybe_ne (nunits, vf)
8342 && dump_enabled_p ())
8343 /* For SLP, VF is set according to the unrolling factor, and not
8344 to the vector size, hence for SLP this print is not valid. */
8345 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8348 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8349 reached. */
8350 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8353 if (!*slp_scheduled)
8355 *slp_scheduled = true;
8357 DUMP_VECT_SCOPE ("scheduling SLP instances");
8359 vect_schedule_slp (loop_vinfo);
8362 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8363 if (slptype == pure_slp)
8364 return;
8367 if (dump_enabled_p ())
8368 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8370 bool grouped_store = false;
8371 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8372 *seen_store = stmt_info;
8375 /* Function vect_transform_loop.
8377 The analysis phase has determined that the loop is vectorizable.
8378 Vectorize the loop - create vectorized stmts to replace the scalar
8379 stmts in the loop, and update the loop exit condition.
8380 Returns scalar epilogue loop if any. */
8382 struct loop *
8383 vect_transform_loop (loop_vec_info loop_vinfo)
8385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8386 struct loop *epilogue = NULL;
8387 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8388 int nbbs = loop->num_nodes;
8389 int i;
8390 tree niters_vector = NULL_TREE;
8391 tree step_vector = NULL_TREE;
8392 tree niters_vector_mult_vf = NULL_TREE;
8393 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8394 unsigned int lowest_vf = constant_lower_bound (vf);
8395 bool slp_scheduled = false;
8396 gimple *stmt;
8397 bool check_profitability = false;
8398 unsigned int th;
8400 DUMP_VECT_SCOPE ("vec_transform_loop");
8402 loop_vinfo->shared->check_datarefs ();
8404 /* Use the more conservative vectorization threshold. If the number
8405 of iterations is constant assume the cost check has been performed
8406 by our caller. If the threshold makes all loops profitable that
8407 run at least the (estimated) vectorization factor number of times,
8408 checking is pointless, too. */
8409 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8410 if (th >= vect_vf_for_cost (loop_vinfo)
8411 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8413 if (dump_enabled_p ())
8414 dump_printf_loc (MSG_NOTE, vect_location,
8415 "Profitability threshold is %d loop iterations.\n",
8416 th);
8417 check_profitability = true;
8420 /* Make sure there exists a single-predecessor exit bb. Do this before
8421 versioning. */
8422 edge e = single_exit (loop);
8423 if (! single_pred_p (e->dest))
8425 split_loop_exit_edge (e);
8426 if (dump_enabled_p ())
8427 dump_printf (MSG_NOTE, "split exit edge\n");
8430 /* Version the loop first, if required, so the profitability check
8431 comes first. */
8433 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8435 poly_uint64 versioning_threshold
8436 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8437 if (check_profitability
8438 && ordered_p (poly_uint64 (th), versioning_threshold))
8440 versioning_threshold = ordered_max (poly_uint64 (th),
8441 versioning_threshold);
8442 check_profitability = false;
8444 vect_loop_versioning (loop_vinfo, th, check_profitability,
8445 versioning_threshold);
8446 check_profitability = false;
8449 /* Make sure there exists a single-predecessor exit bb also on the
8450 scalar loop copy. Do this after versioning but before peeling
8451 so CFG structure is fine for both scalar and if-converted loop
8452 to make slpeel_duplicate_current_defs_from_edges face matched
8453 loop closed PHI nodes on the exit. */
8454 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8456 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8457 if (! single_pred_p (e->dest))
8459 split_loop_exit_edge (e);
8460 if (dump_enabled_p ())
8461 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8465 tree niters = vect_build_loop_niters (loop_vinfo);
8466 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8467 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8468 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8469 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8470 &step_vector, &niters_vector_mult_vf, th,
8471 check_profitability, niters_no_overflow);
8473 if (niters_vector == NULL_TREE)
8475 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8476 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8477 && known_eq (lowest_vf, vf))
8479 niters_vector
8480 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8481 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8482 step_vector = build_one_cst (TREE_TYPE (niters));
8484 else
8485 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8486 &step_vector, niters_no_overflow);
8489 /* 1) Make sure the loop header has exactly two entries
8490 2) Make sure we have a preheader basic block. */
8492 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8494 split_edge (loop_preheader_edge (loop));
8496 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8497 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8498 /* This will deal with any possible peeling. */
8499 vect_prepare_for_masked_peels (loop_vinfo);
8501 /* FORNOW: the vectorizer supports only loops whose body consists
8502 of one basic block (header + empty latch). When the vectorizer
8503 supports more involved loop forms, the order in which the BBs are
8504 traversed needs to be reconsidered. */
8506 for (i = 0; i < nbbs; i++)
8508 basic_block bb = bbs[i];
8509 stmt_vec_info stmt_info;
8511 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8512 gsi_next (&si))
8514 gphi *phi = si.phi ();
8515 if (dump_enabled_p ())
8517 dump_printf_loc (MSG_NOTE, vect_location,
8518 "------>vectorizing phi: ");
8519 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8521 stmt_info = vinfo_for_stmt (phi);
8522 if (!stmt_info)
8523 continue;
8525 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8526 vect_loop_kill_debug_uses (loop, phi);
8528 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8529 && !STMT_VINFO_LIVE_P (stmt_info))
8530 continue;
8532 if (STMT_VINFO_VECTYPE (stmt_info)
8533 && (maybe_ne
8534 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8535 && dump_enabled_p ())
8536 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8538 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8539 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8540 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8541 && ! PURE_SLP_STMT (stmt_info))
8543 if (dump_enabled_p ())
8544 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8545 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8549 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8550 !gsi_end_p (si);)
8552 stmt = gsi_stmt (si);
8553 /* During vectorization remove existing clobber stmts. */
8554 if (gimple_clobber_p (stmt))
8556 unlink_stmt_vdef (stmt);
8557 gsi_remove (&si, true);
8558 release_defs (stmt);
8560 else
8562 stmt_info = vinfo_for_stmt (stmt);
8564 /* vector stmts created in the outer-loop during vectorization of
8565 stmts in an inner-loop may not have a stmt_info, and do not
8566 need to be vectorized. */
8567 stmt_vec_info seen_store = NULL;
8568 if (stmt_info)
8570 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8572 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8573 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8574 !gsi_end_p (subsi); gsi_next (&subsi))
8575 vect_transform_loop_stmt (loop_vinfo,
8576 gsi_stmt (subsi), &si,
8577 &seen_store,
8578 &slp_scheduled);
8579 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8580 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8581 &seen_store, &slp_scheduled);
8583 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8584 &seen_store, &slp_scheduled);
8586 if (seen_store)
8588 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8590 /* Interleaving. The vectorization of the
8591 interleaving chain was completed - free all
8592 the stores in the chain. */
8593 gsi_next (&si);
8594 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8596 else
8598 /* Free the attached stmt_vec_info and remove the
8599 stmt. */
8600 free_stmt_vec_info (stmt);
8601 unlink_stmt_vdef (stmt);
8602 gsi_remove (&si, true);
8603 release_defs (stmt);
8606 else
8607 gsi_next (&si);
8611 /* Stub out scalar statements that must not survive vectorization.
8612 Doing this here helps with grouped statements, or statements that
8613 are involved in patterns. */
8614 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8615 !gsi_end_p (gsi); gsi_next (&gsi))
8617 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8618 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8620 tree lhs = gimple_get_lhs (call);
8621 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8623 tree zero = build_zero_cst (TREE_TYPE (lhs));
8624 gimple *new_stmt = gimple_build_assign (lhs, zero);
8625 gsi_replace (&gsi, new_stmt, true);
8629 } /* BBs in loop */
8631 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8632 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8633 if (integer_onep (step_vector))
8634 niters_no_overflow = true;
8635 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8636 niters_vector_mult_vf, !niters_no_overflow);
8638 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8639 scale_profile_for_vect_loop (loop, assumed_vf);
8641 /* True if the final iteration might not handle a full vector's
8642 worth of scalar iterations. */
8643 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8644 /* The minimum number of iterations performed by the epilogue. This
8645 is 1 when peeling for gaps because we always need a final scalar
8646 iteration. */
8647 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8648 /* +1 to convert latch counts to loop iteration counts,
8649 -min_epilogue_iters to remove iterations that cannot be performed
8650 by the vector code. */
8651 int bias_for_lowest = 1 - min_epilogue_iters;
8652 int bias_for_assumed = bias_for_lowest;
8653 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8654 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8656 /* When the amount of peeling is known at compile time, the first
8657 iteration will have exactly alignment_npeels active elements.
8658 In the worst case it will have at least one. */
8659 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8660 bias_for_lowest += lowest_vf - min_first_active;
8661 bias_for_assumed += assumed_vf - min_first_active;
8663 /* In these calculations the "- 1" converts loop iteration counts
8664 back to latch counts. */
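/* Worked example with made-up numbers: with lowest_vf 4, no peeling
   for gaps and no full masking, bias_for_lowest is 1; a latch-count
   upper bound of 10 (at most 11 scalar iterations) then becomes
   udiv_floor (10 + 1, 4) - 1 = 1, i.e. at most two vector iterations,
   with the remaining scalar iterations left to the epilogue. */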
8665 if (loop->any_upper_bound)
8666 loop->nb_iterations_upper_bound
8667 = (final_iter_may_be_partial
8668 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8669 lowest_vf) - 1
8670 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8671 lowest_vf) - 1);
8672 if (loop->any_likely_upper_bound)
8673 loop->nb_iterations_likely_upper_bound
8674 = (final_iter_may_be_partial
8675 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8676 + bias_for_lowest, lowest_vf) - 1
8677 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8678 + bias_for_lowest, lowest_vf) - 1);
8679 if (loop->any_estimate)
8680 loop->nb_iterations_estimate
8681 = (final_iter_may_be_partial
8682 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8683 assumed_vf) - 1
8684 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8685 assumed_vf) - 1);
8687 if (dump_enabled_p ())
8689 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8691 dump_printf_loc (MSG_NOTE, vect_location,
8692 "LOOP VECTORIZED\n");
8693 if (loop->inner)
8694 dump_printf_loc (MSG_NOTE, vect_location,
8695 "OUTER LOOP VECTORIZED\n");
8696 dump_printf (MSG_NOTE, "\n");
8698 else
8700 dump_printf_loc (MSG_NOTE, vect_location,
8701 "LOOP EPILOGUE VECTORIZED (VS=");
8702 dump_dec (MSG_NOTE, current_vector_size);
8703 dump_printf (MSG_NOTE, ")\n");
8707 /* Free SLP instances here because otherwise stmt reference counting
8708 won't work. */
8709 slp_instance instance;
8710 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8711 vect_free_slp_instance (instance);
8712 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8713 /* Clear the safelen field since its value is invalid after vectorization:
8714 the vectorized loop can have loop-carried dependencies. */
8715 loop->safelen = 0;
8717 /* Don't vectorize epilogue for epilogue. */
8718 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8719 epilogue = NULL;
8721 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8722 epilogue = NULL;
8724 if (epilogue)
8726 auto_vector_sizes vector_sizes;
8727 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8728 unsigned int next_size = 0;
8730 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8731 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8732 && known_eq (vf, lowest_vf))
8734 unsigned int eiters
8735 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8736 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8737 eiters = eiters % lowest_vf;
8738 epilogue->nb_iterations_upper_bound = eiters - 1;
8740 unsigned int ratio;
8741 while (next_size < vector_sizes.length ()
8742 && !(constant_multiple_p (current_vector_size,
8743 vector_sizes[next_size], &ratio)
8744 && eiters >= lowest_vf / ratio))
8745 next_size += 1;
8747 else
8748 while (next_size < vector_sizes.length ()
8749 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8750 next_size += 1;
8752 if (next_size == vector_sizes.length ())
8753 epilogue = NULL;
8756 if (epilogue)
8758 epilogue->force_vectorize = loop->force_vectorize;
8759 epilogue->safelen = loop->safelen;
8760 epilogue->dont_vectorize = false;
8762 /* We may need to if-convert epilogue to vectorize it. */
8763 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8764 tree_if_conversion (epilogue);
8767 return epilogue;
8770 /* The code below attempts a simple optimization: revert if-conversion
8771 for masked stores, i.e. if the mask of a store is all zero, skip the
8772 store and, where possible, the statements producing the stored values.
8773 For example,
8774 for (i=0; i<n; i++)
8775 if (c[i])
8777 p1[i] += 1;
8778 p2[i] = p3[i] +2;
8780 this transformation will produce the following semi-hammock:
8782 if (!(mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }))
8784 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8785 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8786 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8787 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8788 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8789 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8793 void
8794 optimize_mask_stores (struct loop *loop)
8796 basic_block *bbs = get_loop_body (loop);
8797 unsigned nbbs = loop->num_nodes;
8798 unsigned i;
8799 basic_block bb;
8800 struct loop *bb_loop;
8801 gimple_stmt_iterator gsi;
8802 gimple *stmt;
8803 auto_vec<gimple *> worklist;
8805 vect_location = find_loop_location (loop);
8806 /* Pick up all masked stores in loop if any. */
8807 for (i = 0; i < nbbs; i++)
8809 bb = bbs[i];
8810 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8811 gsi_next (&gsi))
8813 stmt = gsi_stmt (gsi);
8814 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8815 worklist.safe_push (stmt);
8819 free (bbs);
8820 if (worklist.is_empty ())
8821 return;
8823 /* Loop has masked stores. */
8824 while (!worklist.is_empty ())
8826 gimple *last, *last_store;
8827 edge e, efalse;
8828 tree mask;
8829 basic_block store_bb, join_bb;
8830 gimple_stmt_iterator gsi_to;
8831 tree vdef, new_vdef;
8832 gphi *phi;
8833 tree vectype;
8834 tree zero;
8836 last = worklist.pop ();
8837 mask = gimple_call_arg (last, 2);
8838 bb = gimple_bb (last);
8839 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8840 to the same loop as if_bb. That loop can differ from LOOP when a
8841 two-level loop nest is vectorized and the mask_store belongs to the
8842 inner loop. */
8843 e = split_block (bb, last);
8844 bb_loop = bb->loop_father;
8845 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8846 join_bb = e->dest;
8847 store_bb = create_empty_bb (bb);
8848 add_bb_to_loop (store_bb, bb_loop);
8849 e->flags = EDGE_TRUE_VALUE;
8850 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8851 /* Treat the edge into STORE_BB as unlikely. */
8852 efalse->probability = profile_probability::unlikely ();
8853 store_bb->count = efalse->count ();
8854 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8855 if (dom_info_available_p (CDI_DOMINATORS))
8856 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
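/* Sketch of the CFG built above (the mask comparison is emitted just
   below): BB ends in "if (mask == { 0, ... })"; its TRUE edge goes
   straight to JOIN_BB, while the unlikely FALSE edge goes to STORE_BB,
   which falls through to JOIN_BB and receives the sunk masked stores.  */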
8857 if (dump_enabled_p ())
8858 dump_printf_loc (MSG_NOTE, vect_location,
8859 "Create new block %d to sink mask stores.",
8860 store_bb->index);
8861 /* Create vector comparison with boolean result. */
8862 vectype = TREE_TYPE (mask);
8863 zero = build_zero_cst (vectype);
8864 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8865 gsi = gsi_last_bb (bb);
8866 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8867 /* Create new PHI node for vdef of the last masked store:
8868 .MEM_2 = VDEF <.MEM_1>
8869 will be converted to
8870 .MEM_3 = VDEF <.MEM_1>
8871 and new PHI node will be created in join bb
8872 .MEM_2 = PHI <.MEM_1, .MEM_3>
8874 vdef = gimple_vdef (last);
8875 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8876 gimple_set_vdef (last, new_vdef);
8877 phi = create_phi_node (vdef, join_bb);
8878 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
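/* The PHI argument for the other incoming edge of JOIN_BB (edge E, taken
   when the mask is all zero) is added after the sinking loop below, using
   the virtual use of the last store moved into STORE_BB.  */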
8880 /* Move all masked stores with the same mask into STORE_BB if possible. */
8881 while (true)
8883 gimple_stmt_iterator gsi_from;
8884 gimple *stmt1 = NULL;
8886 /* Move masked store to STORE_BB. */
8887 last_store = last;
8888 gsi = gsi_for_stmt (last);
8889 gsi_from = gsi;
8890 /* Shift GSI to the previous stmt for further traversal. */
8891 gsi_prev (&gsi);
8892 gsi_to = gsi_start_bb (store_bb);
8893 gsi_move_before (&gsi_from, &gsi_to);
8894 /* Setup GSI_TO to the non-empty block start. */
8895 gsi_to = gsi_start_bb (store_bb);
8896 if (dump_enabled_p ())
8898 dump_printf_loc (MSG_NOTE, vect_location,
8899 "Move stmt to created bb\n");
8900 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8902 /* Move all stored value producers if possible. */
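/* Summary of the checks below: a producer is moved only if it has no
   virtual definition and no volatile operands, its LHS is a vector
   SSA_NAME with no non-debug uses outside STORE_BB, and any virtual use
   it has matches that of the store being sunk; dead scalar statements
   encountered on the way are simply removed.  */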
8903 while (!gsi_end_p (gsi))
8905 tree lhs;
8906 imm_use_iterator imm_iter;
8907 use_operand_p use_p;
8908 bool res;
8910 /* Skip debug statements. */
8911 if (is_gimple_debug (gsi_stmt (gsi)))
8913 gsi_prev (&gsi);
8914 continue;
8916 stmt1 = gsi_stmt (gsi);
8917 /* Do not consider statements writing to memory or having a
8918 volatile operand. */
8919 if (gimple_vdef (stmt1)
8920 || gimple_has_volatile_ops (stmt1))
8921 break;
8922 gsi_from = gsi;
8923 gsi_prev (&gsi);
8924 lhs = gimple_get_lhs (stmt1);
8925 if (!lhs)
8926 break;
8928 /* LHS of vectorized stmt must be SSA_NAME. */
8929 if (TREE_CODE (lhs) != SSA_NAME)
8930 break;
8932 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8934 /* Remove dead scalar statement. */
8935 if (has_zero_uses (lhs))
8937 gsi_remove (&gsi_from, true);
8938 continue;
8942 /* Check that LHS does not have uses outside of STORE_BB. */
8943 res = true;
8944 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8946 gimple *use_stmt;
8947 use_stmt = USE_STMT (use_p);
8948 if (is_gimple_debug (use_stmt))
8949 continue;
8950 if (gimple_bb (use_stmt) != store_bb)
8952 res = false;
8953 break;
8956 if (!res)
8957 break;
8959 if (gimple_vuse (stmt1)
8960 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8961 break;
8963 /* Can move STMT1 to STORE_BB. */
8964 if (dump_enabled_p ())
8966 dump_printf_loc (MSG_NOTE, vect_location,
8967 "Move stmt to created bb\n");
8968 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8970 gsi_move_before (&gsi_from, &gsi_to);
8971 /* Shift GSI_TO for further insertion. */
8972 gsi_prev (&gsi_to);
8974 /* Put other masked stores with the same mask to STORE_BB. */
8975 if (worklist.is_empty ()
8976 || gimple_call_arg (worklist.last (), 2) != mask
8977 || worklist.last () != stmt1)
8978 break;
8979 last = worklist.pop ();
8981 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);